adbar · adbar · Oct 24, 2023 · Oct 19, 2023 · Oct 24, 2023
diff --git a/tests/resources/newsettings.cfg b/tests/resources/newsettings.cfg
@@ -31,3 +31,5 @@ EXTRACTION_TIMEOUT = 0
 MIN_DUPLCHECK_SIZE = 10
 MAX_REPETITIONS = 3
 
+# Htmldate option: default for "extensive_search" when no date extraction parameters are passed
+EXTENSIVE_DATE_SEARCH = off
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -711,6 +711,10 @@ def test_extraction_options():
     assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
     # assert extract(my_html) is None
 
+    my_html = '<html><head/><body>' + '<p>ABC def ghi jkl.</p>'*1000 + '<p>Posted on 1st Dec 2019<.</p></body></html>'
+    assert bare_extraction(my_html, config=ZERO_CONFIG)["date"] is not None
+    assert bare_extraction(my_html, config=NEW_CONFIG)["date"] is None
+
 
 def test_precision_recall():
     '''test precision- and recall-oriented settings'''

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -913,18 +913,27 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
 
         # extract metadata if necessary
         if output_format != 'txt':
+
+            if not date_extraction_params:
+                date_extraction_params = {
+                    "extensive_search": config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH'),
+                }
+
             document = extract_metadata(tree, url, date_extraction_params, no_fallback, author_blacklist)
+
             # cut short if extracted URL in blacklist
             if document.url in url_blacklist:
                 LOGGER.warning('blacklisted URL: %s', url)
                 raise ValueError
+
             # cut short if core elements are missing
             if only_with_metadata is True and any(
                     x is None for x in
                     [document.date, document.title, document.url]
             ):
                 LOGGER.error('no metadata for URL %s', url)
                 raise ValueError
+
         else:
             document = Document()
 

diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg
@@ -26,3 +26,5 @@ EXTRACTION_TIMEOUT = 30
 MIN_DUPLCHECK_SIZE = 100
 MAX_REPETITIONS = 2
 
+# Htmldate option: default for "extensive_search" when no date extraction parameters are passed
+EXTENSIVE_DATE_SEARCH = on