diff --git a/HISTORY.rst b/HISTORY.rst
index 03c9575..2d6bbc0 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -1,6 +1,11 @@
 Changelog
 ---------
 
+0.2.1
+^^^^^
+**release date:** 2016-02-28
+* Adding guess encoding back without python-magic dependency.
+
 0.2
 ^^^^^
 **release date:** 2016-02-27
diff --git a/cleanit/__init__.py b/cleanit/__init__.py
index e377e77..870394b 100644
--- a/cleanit/__init__.py
+++ b/cleanit/__init__.py
@@ -1,3 +1,3 @@
 __title__ = 'cleanit'
 __author__ = 'Rato'
-__version__ = '0.2'
+__version__ = '0.2.1'
diff --git a/cleanit/subtitle.py b/cleanit/subtitle.py
index 09d5898..144e088 100644
--- a/cleanit/subtitle.py
+++ b/cleanit/subtitle.py
@@ -1,16 +1,16 @@
 # -*- coding: utf-8 -*-
 import logging
-import pysrt
 
+import chardet
+import pysrt
 
 logger = logging.getLogger(__name__)
 
 
 class Subtitle(object):
-
     def __init__(self, path, encoding=None):
         self.path = path
-        self.encoding = encoding
+        self.encoding = encoding if encoding else self.guess_encoding(path)
         self.subtitle = None
 
     def __repr__(self):
@@ -22,6 +22,33 @@ def load(self):
     def save(self, path=None, encoding=None):
         self.subtitle.save(path=path if path else self.path, encoding=encoding if encoding else self.encoding)
 
+    def guess_encoding(self, path):
+        # always try utf-8 first
+        encodings = ['utf-8', 'latin-1', 'windows-1251', 'windows-1250', 'iso-8859-9', 'windows-1254', 'windows-1255',
+                     'windows-1256', 'shift-jis', 'gb18030', 'big5']
+
+        # try to decode
+        logger.debug('Trying encodings %r', encodings)
+
+        with open(path, 'r') as f:
+            content = f.read()
+            for encoding in encodings:
+                try:
+                    content.decode(encoding)
+                except UnicodeDecodeError:
+                    pass
+                else:
+                    logger.info('Guessed encoding %s', encoding)
+                    return encoding
+
+        logger.warning('Could not guess encoding from language')
+
+        # fallback on chardet
+        encoding = chardet.detect(content)['encoding']
+        logger.info('Chardet found encoding %s', encoding)
+
+        return encoding
+
     def clean(self, rules, clean_indexes=True):
         self.load()
 
diff --git a/setup.py b/setup.py
index fb01261..a3f1fcc 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,8 @@ def run_tests(self):
         sys.exit(errno)
 
 # requirements
-install_requirements = ['appdirs>=1.4.0', 'click>=4.0', 'jsonschema>=2.5.1', 'pysrt>=1.0.1', 'pyyaml>=3.11']
+install_requirements = ['appdirs>=1.4.0', 'chardet>=2.3.0', 'click>=4.0', 'jsonschema>=2.5.1', 'pysrt>=1.0.1',
+                        'pyyaml>=3.11']
 
 test_requirements = ['pytest', 'pytest-pep8', 'pytest-flakes', 'pytest-cov']
 