harmonydata · AliceeLe · Dec 7, 2024 · Dec 8, 2024 · Dec 8, 2024 · Dec 8, 2024
diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,7 @@ dmypy.json
 .idea/
 
 src/log.txt
+*.html 
+*.pdf
+pdm.lock
+.pdm-python 
diff --git a/README.md b/README.md
@@ -82,6 +82,7 @@ You need a Windows, Linux or Mac system with
 * the requirements in [requirements.txt](./requirements.txt)
 * Java (if you want to extract items from PDFs)
 * [Apache Tika](https://tika.apache.org/download.html) (if you want to extract items from PDFs)
+* wkhtmltopdf (if you want to extract items from HTML files; see the [pdfkit installation instructions](https://pypi.org/project/pdfkit/))
 
 ## 🖥 Installing Harmony Python package
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -16,7 +16,7 @@ maintainers = [
 authors = [
     { name = "Thomas Wood", email = "[email protected]" },
 ]
-requires-python = ">=3.6,<=3.13.0"
+requires-python = ">=3.11,<=3.13.0"
 classifiers=[
     # see https://pypi.org/classifiers/
     "Development Status :: 5 - Production/Stable",
@@ -55,12 +55,19 @@ dependencies = [
     "scikit-learn; python_version <= '3.13'",
     "scipy==1.14.1; python_version <= '3.13'",
     "huggingface-hub==0.25.0; python_version <= '3.13'",
+    "ipykernel>=6.29.5",
+    "pdfkit>=1.0.0",
+    "transformers>=4.47.0",
+    "bs4>=0.0.2",
 ]
 
 [project.optional-dependencies]
 
 # dev - the developer dependency set, for contributors to harmony
-dev = ["check-manifest", "pytest"]
+dev = [
+    "check-manifest",
+    "pytest>=8.3.4",
+]
 
 [project.urls]
 "Documentation" = "https://harmonydata.ac.uk/"

diff --git a/src/harmony/parsing/util/tika_wrapper.py b/src/harmony/parsing/util/tika_wrapper.py
@@ -47,11 +47,11 @@ def parse_pdf_to_plain_text(contents: str) -> str:
     """
     print("Preparing data for Tika")
     content_type, content_string = contents.split(",")
-    file_in_bytes = base64.b64decode(content_string)
+    file_in_bytes = base64.urlsafe_b64decode(content_string)
 
     file = io.BytesIO(file_in_bytes)
     print("Calling Tika")
-    parsed = parser.from_buffer(file, xmlContent=True, requestOptions={'timeout': 300})
+    parsed = parser.from_buffer(file, xmlContent=True, requestOptions={"timeout": 300})
     print("Got response from Tika")
     parsed_xml = parsed["content"]
 

diff --git a/src/harmony/util/file_helper.py b/src/harmony/util/file_helper.py
@@ -28,12 +28,43 @@
 import base64
 import uuid
 from typing import List
+import pdfkit
+from bs4 import BeautifulSoup
+
 
 from harmony.parsing.wrapper_all_parsers import convert_files_to_instruments
 from harmony.schemas.requests.text import Instrument
 from harmony.schemas.requests.text import RawFile
 
+def extract_html_title(file_path):
+    """Return the <title> text of a local HTML file; "Untitled" if the title is missing or empty."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        title = BeautifulSoup(f.read(), "html.parser").title
+    return (" ".join(title.stripped_strings) if title is not None else "") or "Untitled"
+
+def convert_html_to_pdf(file_name: str) -> RawFile:
+    """
+    Render an HTML page as a PDF and wrap the result in a RawFile of type "pdf".
+
+    :param file_name: an http(s) URL, or the path of a local HTML file.
+    :return: a RawFile whose content is "," followed by the URL-safe base64 of the PDF bytes.
+    :raises Exception: any conversion error is printed and re-raised, so None is never returned.
+    """
+    try:
+        # Only genuine http(s) URLs are fetched; a local file merely named "http..." is read from disk.
+        if file_name.startswith(("http://", "https://")):
+            pdf_content = pdfkit.from_url(file_name, False)
+        else:
+            pdf_content = pdfkit.from_file(file_name, False)
+    except Exception as e:
+        print(f"Error during HTML conversion and parsing: {e}")
+        raise
 
+    file_as_base64 = base64.urlsafe_b64encode(pdf_content).decode("ascii")
+    # The leading "," keeps the "<content type>,<base64>" layout that PDF parsing splits on.
+    return RawFile(file_type="pdf", content="," + file_as_base64, file_id=uuid.uuid4().hex,
+                   file_name=file_name)
+
 def load_instruments_from_local_file(file_name: str) -> List[Instrument]:
     """
     Open a local file (PDF, Excel, Word or TXT format) and parse it into a list of Instrument objects.
@@ -46,19 +77,26 @@ def load_instruments_from_local_file(file_name: str) -> List[Instrument]:
         file_type = "xlsx"
     elif file_name.lower().endswith("docx"):
         file_type = "docx"
+    elif file_name.lower().endswith("html"):
+        file_type = "html"
     else:
         file_type = "txt"
 
-    if file_type == "pdf" or file_type == "xlsx" or file_type == "docx":
+    if file_type in ["pdf", "xlsx", "docx"]:
         with open(
                 file_name,
                 "rb") as f:
             file_as_bytes = f.read()
 
-        file_as_base64 = base64.urlsafe_b64encode(file_as_bytes).decode('ascii')
+        file_as_base64 = base64.b64encode(file_as_bytes).decode('ascii')
+        print(f"File as Base64 (first 100 characters): {file_as_base64[:100]}")
 
-        harmony_file = RawFile(file_type=file_type, content="," + file_as_base64, file_id=uuid.uuid4().hex,
+        harmony_file = RawFile(file_type=file_type, content= "," + file_as_base64, file_id=uuid.uuid4().hex,
                                file_name=file_name)
+    elif file_type == "html":
+        harmony_file = convert_html_to_pdf(file_name)
+        title = extract_html_title(file_name)
+        harmony_file.file_name = title
     else:
         with open(
                 file_name,