Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update core.py #331

Merged
merged 4 commits into from
May 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,7 +642,8 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options):
backup_tree = prune_unwanted_nodes(backup_tree, OVERALL_DISCARD_XPATH)
# try with readability
temppost_algo = try_readability(backup_tree)
- algo_text = trim(temppost_algo.text_content())
+ # unicode fix necessary on certain systems (#331)
+ algo_text = trim(tostring(temppost_algo, method='text', encoding='utf-8').decode('utf-8'))
len_algo = len(algo_text)
# compare
LOGGER.debug('extracted length: %s (algorithm) %s (extraction)', len_algo, len_text)
Expand Down
6 changes: 3 additions & 3 deletions trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,13 @@
from justext.utils import get_stoplist # , get_stoplists

from lxml.etree import Element, strip_tags
- from lxml.html import fromstring


# own
from .htmlprocessing import convert_tags, prune_unwanted_nodes, tree_cleaning
from .readability_lxml import Document as ReadabilityDocument # fork
from .settings import JUSTEXT_LANGUAGES
- from .utils import trim, HTML_PARSER
+ from .utils import fromstring_bytes, trim
from .xml import TEI_VALID_TAGS
from .xpaths import PAYWALL_DISCARD_XPATH, REMOVE_COMMENTS_XPATH

Expand All @@ -43,7 +42,8 @@ def try_readability(htmlinput):
# defaults: min_text_length=25, retry_length=250
try:
doc = ReadabilityDocument(htmlinput, min_text_length=25, retry_length=250)
- return fromstring(doc.summary(), parser=HTML_PARSER)
+ # force conversion to utf-8 (see #319)
+ return fromstring_bytes(doc.summary())
except Exception as err:
LOGGER.warning('readability_lxml failed: %s', err)
return Element('div')
Expand Down