diff --git a/.gitignore b/.gitignore index 65d346a..da5dab7 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,7 @@ dmypy.json .idea/ src/log.txt +*.html +*.pdf +pdm.lock +.pdm-python diff --git a/README.md b/README.md index b8c8297..5353d6d 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,7 @@ You need a Windows, Linux or Mac system with * the requirements in [requirements.txt](./requirements.txt) * Java (if you want to extract items from PDFs) * [Apache Tika](https://tika.apache.org/download.html) (if you want to extract items from PDFs) +* Install wkhtmltopdf if you want to use HTML inputs [More instructions here](https://pypi.org/project/pdfkit/) ## 🖥 Installing Harmony Python package diff --git a/pyproject.toml b/pyproject.toml index 03ea82e..bf2dade 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ maintainers = [ authors = [ { name = "Thomas Wood", email = "thomas@fastdatascience.com" }, ] -requires-python = ">=3.6,<=3.13.0" +requires-python = ">=3.11,<=3.13.0" classifiers=[ # see https://pypi.org/classifiers/ "Development Status :: 5 - Production/Stable", @@ -55,12 +55,19 @@ dependencies = [ "scikit-learn; python_version <= '3.13'", "scipy==1.14.1; python_version <= '3.13'", "huggingface-hub==0.25.0; python_version <= '3.13'", + "ipykernel>=6.29.5", + "pdfkit>=1.0.0", + "transformers>=4.47.0", + "bs4>=0.0.2", ] [project.optional-dependencies] # dev - the developer dependency set, for contributors to harmony -dev = ["check-manifest", "pytest"] +dev = [ + "check-manifest", + "pytest>=8.3.4", +] [project.urls] "Documentation" = "https://harmonydata.ac.uk/" diff --git a/src/harmony/parsing/util/tika_wrapper.py b/src/harmony/parsing/util/tika_wrapper.py index c0cc989..fa708ae 100644 --- a/src/harmony/parsing/util/tika_wrapper.py +++ b/src/harmony/parsing/util/tika_wrapper.py @@ -47,11 +47,11 @@ def parse_pdf_to_plain_text(contents: str) -> str: """ print("Preparing data for Tika") content_type, content_string = contents.split(",") - file_in_bytes = base64.b64decode(content_string) + file_in_bytes = base64.urlsafe_b64decode(content_string) file = io.BytesIO(file_in_bytes) print("Calling Tika") - parsed = parser.from_buffer(file, xmlContent=True, requestOptions={'timeout': 300}) + parsed = parser.from_buffer(file, xmlContent=True, requestOptions={"timeout": 300}) print("Got response from Tika") parsed_xml = parsed["content"] diff --git a/src/harmony/util/file_helper.py b/src/harmony/util/file_helper.py index 49895d9..37fb34e 100644 --- a/src/harmony/util/file_helper.py +++ b/src/harmony/util/file_helper.py @@ -28,12 +28,43 @@ import base64 import uuid from typing import List +import pdfkit +from bs4 import BeautifulSoup + from harmony.parsing.wrapper_all_parsers import convert_files_to_instruments from harmony.schemas.requests.text import Instrument from harmony.schemas.requests.text import RawFile +def extract_html_title(file_path): + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + soup = BeautifulSoup(content, "html.parser") + return soup.title.string.strip() if soup.title else "Untitled" + +def convert_html_to_pdf(file_name: str) -> RawFile: + """ + Convert html to pdf, + """ + try: + # Convert HTML to PDF + if file_name.startswith("http"): + pdf_content = pdfkit.from_url(file_name, False) + else: + pdf_content = pdfkit.from_file(file_name, False) + + file_as_base64 = base64.urlsafe_b64encode(pdf_content).decode("ascii") + + return RawFile( + file_type="pdf", + content="," + file_as_base64, + file_id=uuid.uuid4().hex, + file_name=file_name, + ) + except Exception as e: + print(f"Error during HTML conversion and parsing: {e}") + def load_instruments_from_local_file(file_name: str) -> List[Instrument]: """ Open a local file (PDF, Excel, Word or TXT format) and parse it into a list of Instrument objects. @@ -46,19 +77,26 @@ def load_instruments_from_local_file(file_name: str) -> List[Instrument]: file_type = "xlsx" elif file_name.lower().endswith("docx"): file_type = "docx" + elif file_name.lower().endswith("html"): + file_type = "html" else: file_type = "txt" - if file_type == "pdf" or file_type == "xlsx" or file_type == "docx": + if file_type in ["pdf", "xlsx", "docx"]: with open( file_name, "rb") as f: file_as_bytes = f.read() - file_as_base64 = base64.urlsafe_b64encode(file_as_bytes).decode('ascii') + file_as_base64 = base64.b64encode(file_as_bytes).decode('ascii') + print(f"File as Base64 (first 100 characters): {file_as_base64[:100]}") - harmony_file = RawFile(file_type=file_type, content="," + file_as_base64, file_id=uuid.uuid4().hex, + harmony_file = RawFile(file_type=file_type, content= "," + file_as_base64, file_id=uuid.uuid4().hex, file_name=file_name) + elif file_type == "html": + harmony_file = convert_html_to_pdf(file_name) + title = extract_html_title(file_name) + harmony_file.file_name = title else: with open( file_name, diff --git a/test_read_html.ipynb b/test_read_html.ipynb new file mode 100644 index 0000000..786b0ca --- /dev/null +++ b/test_read_html.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: transformers not available. To use transformers, run pip install sentence-transformers\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/chauanhle/Library/Mobile Documents/com~apple~CloudDocs/Documents/CMU/17-313/harmonyWrapper/harmonyAPI/harmonyapi/harmony/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from harmony import convert_pdf_to_instruments\n", + "from harmony.schemas.requests.text import RawFile\n", + "from harmony.util.file_helper import load_instruments_from_local_file\n", + "import base64" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"gad-7.pdf\", \"rb\") as file: \n", + " byte_data = file.read() \n", + "\n", + "encoded = \",\" + base64.urlsafe_b64encode(byte_data).decode(\"ascii\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='da362802746b4310851eca57aca74746', instrument_id='ca11045cd0724ba19fc0fdbe93e7a890', instrument_name='GAD-7.pdf', file_name='GAD-7.pdf', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Feeling nervous, anxious, or on edge', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Not being able to stop or control worrying', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='Worrying too much about different things', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Trouble relaxing', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Being so restless that it is hard to sit still', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='Becoming easily annoyed or irritable', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Feeling afraid, as if something awful might happen', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='If you checked any problems, how difficult have they made it for you to do your work, take care of things at home, or get along with other people?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_file = RawFile.model_validate(RawFile.model_validate({\n", + " \"file_id\": \"d39f31718513413fbfc620c6b6135d0c\",\n", + " \"file_name\": \"GAD-7.pdf\",\n", + " \"file_type\": \"pdf\",\n", + " \"content\": encoded}))\n", + "convert_pdf_to_instruments(raw_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File as Base64 (first 100 characters): JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0NhdGFsb2cvUGFnZXMgMiAwIFIvTGFuZyhlbi1VUykgL1N0cnVjdFRy\n", + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='f671203dfe0d492d81a13f7dbf39421a', instrument_id='0df8db70628a4ae6aae5b28218d816c7', instrument_name='gad-7.pdf', file_name='gad-7.pdf', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Feeling nervous, anxious, or on edge', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Not being able to stop or control worrying', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='Worrying too much about different things', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Trouble relaxing', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Being so restless that it is hard to sit still', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='Becoming easily annoyed or irritable', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Feeling afraid, as if something awful might happen', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='If you checked any problems, how difficult have they made it for you to do your work, take care of things at home, or get along with other people?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If this previous cells work, then this should work with no error \n", + "load_instruments_from_local_file(\"gad-7.pdf\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use PDFKIT to convert html to pdf, then handle the content as if it comes from a pdf file" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pdfkit\n", + "import uuid " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='cbdabbb5f909450b846cc81bb0890133', instrument_id='4fbc45efa4144a2e8004ad89ed6c97db', instrument_name='GAD-7 Chinese.html', file_name='GAD-7 Chinese.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='⼏天', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='⼀半以上天数', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='⼏乎每天', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_instruments_from_local_file(\"GAD-7 Chinese.html\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='2c3bfada19a149b08c0e4646309bc715', instrument_id='9638fa85bb2c41fe91d99e066573029d', instrument_name='GAD-7 Espanol.html', file_name='GAD-7 Espanol.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Durante las últimas 2 semanas, ¿con qué frecuencia ha sentido molestias por los siguientes problemas?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Varios días', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='Más de la mitad de los días', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Casi todos los días', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Sentirse nervioso/a, intranquilo/a o con los nervios de punta', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='No poder dejar de preocuparse o no poder controlar la preocupación', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Preocuparse demasiado por diferentes cosas', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='Dificultad para relajarse', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='9', question_intro=None, question_text='Estar tan inquieto/a que es difícil permanecer sentado/a tranquilamente', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='10', question_intro=None, question_text='Molestarse o ponerse irritable fácilmente', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='11', question_intro=None, question_text='Sentir miedo como si algo terrible pudiera pasar', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_instruments_from_local_file(\"GAD-7 Espanol.html\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='b7ef1503287843c2ae688676b26d8f31', instrument_id='d3e1849c25da4662963dc4d175387582', instrument_name='GHQ-12.html', file_name='GHQ-12.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text=\"The next questions are about how you have been feeling over the last few weeks. Have you recently been able to concentrate on whatever you're doing?\", options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Better than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Less than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Much less than usual Have you recently lost much sleep over worry?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='Much more than usual Have you recently felt that you were playing a useful part in things?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='9', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='10', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='11', question_intro=None, question_text='Less so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='12', question_intro=None, question_text='Much less than usual Have you recently felt capable of making decisions about things?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='13', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='14', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='15', question_intro=None, question_text='Less so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='16', question_intro=None, question_text='Much less capable Have you recently felt constantly under strain?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='17', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='18', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='19', question_intro=None, question_text=\"Much more than usual Have you recently felt you couldn't overcome your difficulties?\", options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='20', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='21', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='22', question_intro=None, question_text='Much more than usual Have you recently been able to enjoy your normal day-to-day activities?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='23', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='24', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='25', question_intro=None, question_text='Less so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='26', question_intro=None, question_text='Much less than usual Have you recently been able to face up to problems?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='27', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='28', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='29', question_intro=None, question_text='Less able than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='30', question_intro=None, question_text='Much less able Have you recently been feeling unhappy or depressed?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='31', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='32', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='33', question_intro=None, question_text='Much more than usual Have you recently been losing confidence in yourself?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='34', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='35', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='36', question_intro=None, question_text='Much more than usual Have you recently been thinking of yourself as a worthless person?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='37', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='38', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='39', question_intro=None, question_text='Much more than usual Have you recently been feeling reasonably happy, all things considered?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='40', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='41', question_intro=None, question_text='About the same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='42', question_intro=None, question_text='Less so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_instruments_from_local_file(\"GHQ-12.html\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='b4a36b224d1949d7b3b76fd48ccfaaa9', instrument_id='3f937da7fb08483db9187828bcf633bb', instrument_name='GAD-7.html', file_name='GAD-7.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Over the last 2 weeks, how often have you been bothered by the following problems?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Several days', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='More than half the days', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Nearly every day', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Feeling nervous, anxious, or on edge', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='Not being able to stop or control worrying', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Worrying too much about different things', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='Trouble relaxing', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='9', question_intro=None, question_text='Being so restless that it is hard to sit still', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='10', question_intro=None, question_text='Becoming easily annoyed or irritable', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='11', question_intro=None, question_text='Feeling afraid as if something awful might happen', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_instruments_from_local_file(\"GAD-7.html\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/test_convert_html.py b/tests/test_convert_html.py new file mode 100644 index 0000000..602596b --- /dev/null +++ b/tests/test_convert_html.py @@ -0,0 +1,124 @@ +import unittest +from harmony import load_instruments_from_local_file +from harmony.schemas.requests.text import Instrument +import tempfile +import os + + +class TestLoadInstrumentsFromLocalFile(unittest.TestCase): + def setUp(self): + """Set up temporary files for testing.""" + # English HTML content + self.html_content_english = """ + + + + + GAD-7 Form + + +
GAD-7
+

Over the last 2 weeks, how often have you been bothered by the following problems?

+ + + + + + + + + + + + + + + + + + + +
QuestionNot at all
(0)
Several days
(1)
More than half the days
(2)
Nearly every day
(3)
1. Feeling nervous, anxious, or on edge
+ + + """ + # Chinese HTML content + self.html_content_chinese = """ + + + + + 广泛性焦虑症量表(GAD-7) + + +

广泛性焦虑症量表(GAD-7)

+

在过去两个星期,有多少时候您受到以下问题所困扰?

+ + + + + + + + + + + + + + + + + + + +
问题完全没有
(0)
几天
(1)
一半以上天数
(2)
几乎每天
(3)
1. 感觉紧张、焦虑或不安
+ + + """ + # Temporary files + self.temp_file_english = tempfile.NamedTemporaryFile(delete=False, suffix=".html") + self.temp_file_english.write(self.html_content_english.encode('utf-8')) + self.temp_file_english.close() + + self.temp_file_chinese = tempfile.NamedTemporaryFile(delete=False, suffix=".html") + self.temp_file_chinese.write(self.html_content_chinese.encode('utf-8')) + self.temp_file_chinese.close() + + def tearDown(self): + """Remove temporary files after tests.""" + os.unlink(self.temp_file_english.name) + os.unlink(self.temp_file_chinese.name) + + def test_load_instruments_english_html(self): + """Test loading instruments from an English HTML file.""" + instruments = load_instruments_from_local_file(self.temp_file_english.name) + + self.assertIsInstance(instruments, list) + + self.assertGreater(len(instruments), 0) + + instrument = instruments[0] + self.assertIsInstance(instrument, Instrument) + + self.assertEqual(instrument.instrument_name, "GAD-7 Form") + self.assertGreater(len(instrument.questions), 0) + + + def test_load_instruments_chinese_html(self): + """Test loading instruments from a Chinese HTML file.""" + instruments = load_instruments_from_local_file(self.temp_file_chinese.name) + + self.assertIsInstance(instruments, list) + + self.assertGreater(len(instruments), 0) + + instrument = instruments[0] + self.assertIsInstance(instrument, Instrument) + + self.assertEqual(instrument.instrument_name, "广泛性焦虑症量表(GAD-7)") + self.assertGreater(len(instrument.questions), 0) + + +if __name__ == "__main__": + unittest.main()