diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index ed4bb046..48fef5f0 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -387,7 +387,10 @@ def test_external():
teststring = f.read()
assert extract(teststring, no_fallback=True, include_tables=False) == ''
assert extract(teststring, no_fallback=False, include_tables=False) == ''
-
+ # Invalid XML attributes (issue #375): a colon in an attribute key is reserved for namespaces, so those attributes should be stripped.
+ bad_xml = 'Testing
Features: - Saves the cost of two dedicated phone lines.
al station using Internet or cellular technology. - Requires no change to the existing Fire Alarm Control Panel configuration. The IPGSM-4G connects directly to the primary and secondary telephone ports.
'
+ res = extract(bad_xml, output_format='xml')
+ assert "Features" in res
def test_images():
'''Test image extraction function'''
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index 3b8bbc04..546c689e 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -303,6 +303,12 @@ def sanitize_tree(tree):
preserve_space = elem.tag in SPACING_PROTECTED or parent_tag in SPACING_PROTECTED
trailing_space = elem.tag in FORMATTING_PROTECTED or parent_tag in FORMATTING_PROTECTED or preserve_space
+ for attrib_key in elem.attrib.keys():
+ # Remove attributes whose key contains a colon when the value is empty or the prefix is not a declared namespace
+ if ':' in attrib_key: # colon is reserved for namespaces in XML
+ if not elem.attrib[attrib_key] or attrib_key.split(':')[0] not in tree.nsmap:
+ elem.attrib.pop(attrib_key)
+
if elem.text:
elem.text = sanitize(elem.text, preserve_space, trailing_space)
if elem.tail: