Skip to content

Commit

Permalink
Add extract_with_metadata method in core, with corresponding unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
CodyInnowhere authored and CodyInnowhere committed Dec 9, 2024
1 parent 7067937 commit b534c66
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 2 deletions.
56 changes: 55 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from charset_normalizer import detect

import trafilatura.htmlprocessing
from trafilatura import bare_extraction, extract, xml
from trafilatura import bare_extraction, extract, extract_with_metadata, xml
from trafilatura.core import Extractor
from trafilatura.external import sanitize_tree, try_justext, try_readability
from trafilatura.main_extractor import (handle_formatting, handle_image,
Expand Down Expand Up @@ -436,6 +436,59 @@ def test_formatting():
assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result


def test_extract_with_metadata():
    '''Test the extract_with_metadata() entry point: it must return a
    Document object carrying both the extracted text and the metadata
    (url, title, date, raw_text, fingerprint), and reject unsupported
    output formats with a ValueError.'''
    url = 'http://aa.bb/cc.html'

    # minimal document: no title/date in the markup, so only the
    # caller-supplied URL is expected in the metadata
    my_document = html.fromstring("""<html>
        <head></head>
        <body>
            <article>
                <p>AAA, <p>BBB</p>, CCC.</p>
            </article>
        </body>
    </html>
    """)
    parsed_doc = extract_with_metadata(my_document, output_format='txt', include_formatting=True, fast=True, url=url)
    content = parsed_doc.text
    assert 'AAA' in content and 'BBB' in content and 'CCC' in content
    assert url == parsed_doc.url and parsed_doc.date is None and parsed_doc.title is None

    # document with a <title> and a date string: both must be picked up
    my_document = html.fromstring("""<html>
        <head><title>title</title></head>
        <body>
            <article>
                <div>May 24, 2021</div>
                <p>AAA, <p>BBB</p>, CCC.</p>
            </article>
        </body>
    </html>
    """)
    parsed_doc = extract_with_metadata(my_document, output_format='txt', include_formatting=True, fast=True, url=url)
    content = parsed_doc.text
    assert 'AAA' in content and 'BBB' in content and 'CCC' in content
    assert url == parsed_doc.url and '2021-05-24' == parsed_doc.date and 'title' == parsed_doc.title

    # non-TXT formats additionally expose raw_text and a content fingerprint
    parsed_doc = extract_with_metadata(my_document, output_format='xml')
    assert 'AAA, BBB , CCC.' == parsed_doc.raw_text and 'ee7d2fb6fcf2837d' == parsed_doc.fingerprint
    content = parsed_doc.text
    assert 'AAA' in content and 'BBB' in content and 'CCC' in content

    # language filter: a Spanish-tagged page must be discarded when
    # English is requested
    my_document = html.fromstring("""<html>
        <head><meta http-equiv="content-language" content="es"></head>
        <body>
            <article>
                <p>AAA, <p>BBB</p>, CCC.</p>
            </article>
        </body>
    </html>
    """)
    parsed_doc = extract_with_metadata(my_document, target_language='en', fast=True)
    assert parsed_doc is None

    # "python" is only valid for bare_extraction(), not here
    with pytest.raises(ValueError):
        extract_with_metadata(my_document, output_format="python")


def test_external():
'''Test external components'''
options = DEFAULT_OPTIONS
Expand Down Expand Up @@ -1637,6 +1690,7 @@ def test_deprecations():
test_trim()
test_input()
test_formatting()
test_extract_with_metadata()
test_exotic_tags()
test_images()
test_links()
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import logging

from .baseline import baseline, html2txt
from .core import bare_extraction, extract
from .core import bare_extraction, extract, extract_with_metadata
from .downloads import fetch_response, fetch_url
from .metadata import extract_metadata
from .utils import load_html
Expand All @@ -25,6 +25,7 @@
"baseline",
"extract",
"extract_metadata",
"extract_with_metadata",
"fetch_response",
"fetch_url",
"html2txt",
Expand Down
113 changes: 113 additions & 0 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,3 +486,116 @@ def extract(

# return
return determine_returnstring(document, options)


def extract_with_metadata(
    filecontent: Any,
    url: Optional[str] = None,
    record_id: Optional[str] = None,
    fast: bool = False,
    favor_precision: bool = False,
    favor_recall: bool = False,
    include_comments: bool = True,
    output_format: str = "txt",
    tei_validation: bool = False,
    target_language: Optional[str] = None,
    include_tables: bool = True,
    include_images: bool = False,
    include_formatting: bool = False,
    include_links: bool = False,
    deduplicate: bool = False,
    date_extraction_params: Optional[Dict[str, Any]] = None,
    url_blacklist: Optional[Set[str]] = None,
    author_blacklist: Optional[Set[str]] = None,
    settingsfile: Optional[str] = None,
    prune_xpath: Optional[Any] = None,
    config: Any = DEFAULT_CONFIG,
    options: Optional[Extractor] = None,
) -> Optional[Document]:
    """Main function exposed by the package:
    Wrapper for text extraction and conversion to chosen output format.
    This method also returns document metadata.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        record_id: Add an ID to the metadata.
        fast: Use faster heuristics and skip backup extraction.
        favor_precision: prefer less text but correct extraction.
        favor_recall: when unsure, prefer more text.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format:
            "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        tei_validation: Validate the XML-TEI output with respect to the TEI standard.
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (only valuable if output_format is set to XML).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        settingsfile: Use a configuration file to override the standard settings.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        Document metadata with content string in the desired format or None.

    Raises:
        ValueError: If output_format is "python", which is only usable
            in the bare_extraction() function.
    """
    # regroup extraction options: build an Extractor unless a ready-made
    # one was passed in (isinstance already rejects None)
    if not isinstance(options, Extractor):
        options = Extractor(
            config=use_config(settingsfile, config),
            output_format=output_format,
            fast=fast,
            precision=favor_precision,
            recall=favor_recall,
            comments=include_comments,
            formatting=include_formatting,
            links=include_links,
            images=include_images,
            tables=include_tables,
            dedup=deduplicate,
            lang=target_language,
            url=url,
            with_metadata=True,
            only_with_metadata=False,
            tei_validation=tei_validation,
            author_blacklist=author_blacklist,
            url_blacklist=url_blacklist,
            date_params=date_extraction_params,
        )

    # extraction
    document = bare_extraction(
        filecontent,
        options=options,
        as_dict=False,
        prune_xpath=prune_xpath,
    )

    # post-processing: discard empty or filtered-out results
    if not document or not isinstance(document, Document):
        return None

    # record_id and fingerprint are only attached for structured (non-TXT)
    # output formats, mirroring the behavior of extract()
    if options.format not in TXT_FORMATS:
        # control output
        if options.format == "python":
            raise ValueError(
                "'python' format only usable in bare_extraction() function"
            )
        # add record ID to metadata
        document.id = record_id
        # calculate fingerprint over title + raw text
        if document.raw_text is not None:
            document.fingerprint = content_fingerprint(
                str(document.title) + " " + str(document.raw_text)
            )

    document.text = determine_returnstring(document, options)
    return document

0 comments on commit b534c66

Please sign in to comment.