Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add htmldate extensive search to config #434

Merged
merged 2 commits on
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tests/resources/newsettings.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,5 @@ EXTRACTION_TIMEOUT = 0
MIN_DUPLCHECK_SIZE = 10
MAX_REPETITIONS = 3

# Htmldate option: enable the extensive date search (boolean, on/off)
EXTENSIVE_DATE_SEARCH = off
4 changes: 4 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,10 @@ def test_extraction_options():
assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
# assert extract(my_html) is None

my_html = '<html><head/><body>' + '<p>ABC def ghi jkl.</p>'*1000 + '<p>Posted on 1st Dec 2019<.</p></body></html>'
assert bare_extraction(my_html, config=ZERO_CONFIG)["date"] is not None
assert bare_extraction(my_html, config=NEW_CONFIG)["date"] is None


def test_precision_recall():
'''test precision- and recall-oriented settings'''
Expand Down
9 changes: 9 additions & 0 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -913,18 +913,27 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,

# extract metadata if necessary
if output_format != 'txt':

if not date_extraction_params:
date_extraction_params = {
"extensive_search": config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH'),
}

document = extract_metadata(tree, url, date_extraction_params, no_fallback, author_blacklist)

# cut short if extracted URL in blacklist
if document.url in url_blacklist:
LOGGER.warning('blacklisted URL: %s', url)
raise ValueError

# cut short if core elements are missing
if only_with_metadata is True and any(
x is None for x in
[document.date, document.title, document.url]
):
LOGGER.error('no metadata for URL %s', url)
raise ValueError

else:
document = Document()

Expand Down
2 changes: 2 additions & 0 deletions trafilatura/settings.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@ EXTRACTION_TIMEOUT = 30
MIN_DUPLCHECK_SIZE = 100
MAX_REPETITIONS = 2

# Htmldate option: enable the extensive date search (boolean, on/off)
EXTENSIVE_DATE_SEARCH = on
Loading