Skip to content

Commit

Permalink
Add extract_with_metadata method in core, with corresponding unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
CodyInnowhere authored and CodyInnowhere committed Dec 9, 2024
1 parent 7067937 commit b534c66
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 2 deletions.
56 changes: 55 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from charset_normalizer import detect

import trafilatura.htmlprocessing
from trafilatura import bare_extraction, extract, xml
from trafilatura import bare_extraction, extract, extract_with_metadata, xml
from trafilatura.core import Extractor
from trafilatura.external import sanitize_tree, try_justext, try_readability
from trafilatura.main_extractor import (handle_formatting, handle_image,
Expand Down Expand Up @@ -436,6 +436,59 @@ def test_formatting():
assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result


def test_extract_with_metadata():
    '''Test the extract_with_metadata() entry point: it must return a
    Document object carrying both the extracted text and the metadata
    (url, title, date, raw_text, fingerprint), and reject unsupported
    output formats with a ValueError.'''
    url = 'http://aa.bb/cc.html'

    # minimal document: no title/date in the markup, so only the
    # caller-supplied URL is expected in the metadata
    my_document = html.fromstring("""<html>
        <head></head>
        <body>
            <article>
                <p>AAA, <p>BBB</p>, CCC.</p>
            </article>
        </body>
    </html>
    """)
    parsed_doc = extract_with_metadata(my_document, output_format='txt', include_formatting=True, fast=True, url=url)
    content = parsed_doc.text
    assert 'AAA' in content and 'BBB' in content and 'CCC' in content
    assert url == parsed_doc.url and parsed_doc.date is None and parsed_doc.title is None

    # document with a <title> and a date string: both must be picked up
    my_document = html.fromstring("""<html>
        <head><title>title</title></head>
        <body>
            <article>
                <div>May 24, 2021</div>
                <p>AAA, <p>BBB</p>, CCC.</p>
            </article>
        </body>
    </html>
    """)
    parsed_doc = extract_with_metadata(my_document, output_format='txt', include_formatting=True, fast=True, url=url)
    content = parsed_doc.text
    assert 'AAA' in content and 'BBB' in content and 'CCC' in content
    assert url == parsed_doc.url and '2021-05-24' == parsed_doc.date and 'title' == parsed_doc.title

    # non-TXT formats additionally expose raw_text and a content fingerprint
    parsed_doc = extract_with_metadata(my_document, output_format='xml')
    assert 'AAA, BBB , CCC.' == parsed_doc.raw_text and 'ee7d2fb6fcf2837d' == parsed_doc.fingerprint
    content = parsed_doc.text
    assert 'AAA' in content and 'BBB' in content and 'CCC' in content

    # language filter: a Spanish-tagged page must be discarded when
    # English is requested
    my_document = html.fromstring("""<html>
        <head><meta http-equiv="content-language" content="es"></head>
        <body>
            <article>
                <p>AAA, <p>BBB</p>, CCC.</p>
            </article>
        </body>
    </html>
    """)
    parsed_doc = extract_with_metadata(my_document, target_language='en', fast=True)
    assert parsed_doc is None

    # "python" is only valid for bare_extraction(), not here
    with pytest.raises(ValueError):
        extract_with_metadata(my_document, output_format="python")


def test_external():
'''Test external components'''
options = DEFAULT_OPTIONS
Expand Down Expand Up @@ -1637,6 +1690,7 @@ def test_deprecations():
test_trim()
test_input()
test_formatting()
test_extract_with_metadata()
test_exotic_tags()
test_images()
test_links()
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import logging

from .baseline import baseline, html2txt
from .core import bare_extraction, extract
from .core import bare_extraction, extract, extract_with_metadata
from .downloads import fetch_response, fetch_url
from .metadata import extract_metadata
from .utils import load_html
Expand All @@ -25,6 +25,7 @@
"baseline",
"extract",
"extract_metadata",
"extract_with_metadata",
"fetch_response",
"fetch_url",
"html2txt",
Expand Down
113 changes: 113 additions & 0 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,3 +486,116 @@ def extract(

# return
return determine_returnstring(document, options)


def extract_with_metadata(
    filecontent: Any,
    url: Optional[str] = None,
    record_id: Optional[str] = None,
    fast: bool = False,
    favor_precision: bool = False,
    favor_recall: bool = False,
    include_comments: bool = True,
    output_format: str = "txt",
    tei_validation: bool = False,
    target_language: Optional[str] = None,
    include_tables: bool = True,
    include_images: bool = False,
    include_formatting: bool = False,
    include_links: bool = False,
    deduplicate: bool = False,
    date_extraction_params: Optional[Dict[str, Any]] = None,
    url_blacklist: Optional[Set[str]] = None,
    author_blacklist: Optional[Set[str]] = None,
    settingsfile: Optional[str] = None,
    prune_xpath: Optional[Any] = None,
    config: Any = DEFAULT_CONFIG,
    options: Optional[Extractor] = None,
) -> Optional[Document]:
    """Main function exposed by the package:
    Wrapper for text extraction and conversion to chosen output format.
    This method also returns document metadata.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        record_id: Add an ID to the metadata.
        fast: Use faster heuristics and skip backup extraction.
        favor_precision: prefer less text but correct extraction.
        favor_recall: when unsure, prefer more text.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format:
            "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        tei_validation: Validate the XML-TEI output with respect to the TEI standard.
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (only valuable if output_format is set to XML).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        settingsfile: Use a configuration file to override the standard settings.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        Document metadata with content string in the desired format or None.

    Raises:
        ValueError: If output_format is "python", which is only usable
            in the bare_extraction() function.
    """
    # regroup extraction options: build an Extractor unless a ready-made
    # one was passed in (isinstance already rejects None)
    if not isinstance(options, Extractor):
        options = Extractor(
            config=use_config(settingsfile, config),
            output_format=output_format,
            fast=fast,
            precision=favor_precision,
            recall=favor_recall,
            comments=include_comments,
            formatting=include_formatting,
            links=include_links,
            images=include_images,
            tables=include_tables,
            dedup=deduplicate,
            lang=target_language,
            url=url,
            with_metadata=True,
            only_with_metadata=False,
            tei_validation=tei_validation,
            author_blacklist=author_blacklist,
            url_blacklist=url_blacklist,
            date_params=date_extraction_params,
        )

    # extraction
    document = bare_extraction(
        filecontent,
        options=options,
        as_dict=False,
        prune_xpath=prune_xpath,
    )

    # post-processing: discard empty or filtered-out results
    if not document or not isinstance(document, Document):
        return None

    # record_id and fingerprint are only attached for structured (non-TXT)
    # output formats, mirroring the behavior of extract()
    if options.format not in TXT_FORMATS:
        # control output
        if options.format == "python":
            raise ValueError(
                "'python' format only usable in bare_extraction() function"
            )
        # add record ID to metadata
        document.id = record_id
        # calculate fingerprint over title + raw text
        if document.raw_text is not None:
            document.fingerprint = content_fingerprint(
                str(document.title) + " " + str(document.raw_text)
            )

    document.text = determine_returnstring(document, options)
    return document

0 comments on commit b534c66

Please sign in to comment.