diff --git a/trafilatura/core.py b/trafilatura/core.py index ce586d57..bc936fbe 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -642,7 +642,8 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options): backup_tree = prune_unwanted_nodes(backup_tree, OVERALL_DISCARD_XPATH) # try with readability temppost_algo = try_readability(backup_tree) - algo_text = trim(temppost_algo.text_content()) + # unicode fix necessary on certain systems (#331) + algo_text = trim(tostring(temppost_algo, method='text', encoding='utf-8').decode('utf-8')) len_algo = len(algo_text) # compare LOGGER.debug('extracted length: %s (algorithm) %s (extraction)', len_algo, len_text) diff --git a/trafilatura/external.py b/trafilatura/external.py index 3befe497..eda6ea38 100644 --- a/trafilatura/external.py +++ b/trafilatura/external.py @@ -18,14 +18,13 @@ from justext.utils import get_stoplist # , get_stoplists from lxml.etree import Element, strip_tags -from lxml.html import fromstring # own from .htmlprocessing import convert_tags, prune_unwanted_nodes, tree_cleaning from .readability_lxml import Document as ReadabilityDocument # fork from .settings import JUSTEXT_LANGUAGES -from .utils import trim, HTML_PARSER +from .utils import fromstring_bytes, trim from .xml import TEI_VALID_TAGS from .xpaths import PAYWALL_DISCARD_XPATH, REMOVE_COMMENTS_XPATH @@ -43,7 +42,8 @@ def try_readability(htmlinput): # defaults: min_text_length=25, retry_length=250 try: doc = ReadabilityDocument(htmlinput, min_text_length=25, retry_length=250) - return fromstring(doc.summary(), parser=HTML_PARSER) + # force conversion to utf-8 (see #319) + return fromstring_bytes(doc.summary()) except Exception as err: LOGGER.warning('readability_lxml failed: %s', err) return Element('div')