diff --git a/pytesseract/__init__.py b/pytesseract/__init__.py
index 40164fa9..32573175 100644
--- a/pytesseract/__init__.py
+++ b/pytesseract/__init__.py
@@ -2,6 +2,7 @@
 from .pytesseract import ALTONotSupported
 from .pytesseract import get_languages
 from .pytesseract import get_tesseract_version
+from .pytesseract import has_libcurl
 from .pytesseract import image_to_alto_xml
 from .pytesseract import image_to_boxes
 from .pytesseract import image_to_data
@@ -14,6 +15,7 @@
 from .pytesseract import TesseractError
 from .pytesseract import TesseractNotFoundError
 from .pytesseract import TSVNotSupported
+from .pytesseract import URLNotSupported
 
 
 __version__ = '0.3.13'
diff --git a/pytesseract/pytesseract.py b/pytesseract/pytesseract.py
index 37837f4c..5c36b7db 100644
--- a/pytesseract/pytesseract.py
+++ b/pytesseract/pytesseract.py
@@ -81,6 +81,7 @@
 
 TESSERACT_MIN_VERSION = Version('3.05')
 TESSERACT_ALTO_VERSION = Version('4.1.0')
+TESSERACT_URL_VERSION = Version('4.1.1')
 
 
 class Output:
@@ -124,6 +125,14 @@ def __init__(self):
         )
 
 
+class URLNotSupported(EnvironmentError):
+    def __init__(self):
+        super().__init__(
+            'URL input not supported. '
+            'Tesseract >= 4.1.1 built with libcurl required',
+        )
+
+
 def kill(process, code):
     process.terminate()
     try:
@@ -209,7 +218,14 @@ def save(image):
     try:
         with NamedTemporaryFile(prefix='tess_', delete=False) as f:
             if isinstance(image, str):
-                yield f.name, realpath(normpath(normcase(image)))
+                if image.startswith(('http:', 'https:')):
+                    if get_tesseract_version(
+                        cached=True,
+                    ) < TESSERACT_URL_VERSION or not has_libcurl(cached=True):
+                        raise URLNotSupported()
+                    yield f.name, image
+                else:
+                    yield f.name, realpath(normpath(normcase(image)))
                 return
             image, extension = prepare(image)
             input_file_name = f'{f.name}_input{extsep}{extension}'
@@ -470,6 +486,24 @@ def get_tesseract_version():
     return version
 
 
+@run_once
+def has_libcurl():
+    """
+    Returns True if tesseract-ocr was installed with libcurl or False otherwise
+    """
+    try:
+        output = subprocess.check_output(
+            [tesseract_cmd, '--version'],
+            stderr=subprocess.STDOUT,
+            env=environ,
+            stdin=subprocess.DEVNULL,
+        )
+    except OSError:
+        raise TesseractNotFoundError()
+
+    return 'libcurl' in output.decode(DEFAULT_ENCODING)
+
+
 def image_to_string(
     image,
     lang=None,
diff --git a/tests/pytesseract_test.py b/tests/pytesseract_test.py
index 52efa2dd..d9d111d6 100644
--- a/tests/pytesseract_test.py
+++ b/tests/pytesseract_test.py
@@ -14,6 +14,7 @@
 from pytesseract import ALTONotSupported
 from pytesseract import get_languages
 from pytesseract import get_tesseract_version
+from pytesseract import has_libcurl
 from pytesseract import image_to_alto_xml
 from pytesseract import image_to_boxes
 from pytesseract import image_to_data
@@ -24,6 +25,7 @@
 from pytesseract import run_and_get_multiple_output
 from pytesseract import TesseractNotFoundError
 from pytesseract import TSVNotSupported
+from pytesseract import URLNotSupported
 from pytesseract.pytesseract import file_to_dict
 from pytesseract.pytesseract import numpy_installed
 from pytesseract.pytesseract import pandas_installed
@@ -45,11 +47,16 @@
 IS_PYTHON_3 = not IS_PYTHON_2
 
 TESSERACT_VERSION = tuple(get_tesseract_version().release)  # to skip tests
+HAS_LIBCURL = has_libcurl()  # to skip tests
 
 TESTS_DIR = path.dirname(path.abspath(__file__))
 DATA_DIR = path.join(TESTS_DIR, 'data')
 TESSDATA_DIR = path.join(TESTS_DIR, 'tessdata')
 TEST_JPEG = path.join(DATA_DIR, 'test.jpg')
+TEST_JPEG_URL = (
+    'https://github.com/madmaze/pytesseract'
+    '/blob/master/tests/data/test.jpg?raw=true'
+)
 
 pytestmark = pytest.mark.pytesseract  # used marker for the module
 string_type = unicode if IS_PYTHON_2 else str  # noqa: 821
@@ -121,6 +128,19 @@ def test_image_to_string_with_image_type(test_file):
     assert 'The quick brown dog' in image_to_string(test_file_path, 'eng')
 
 
+@pytest.mark.parametrize(
+    'test_file',
+    [TEST_JPEG_URL],
+    ids=['jpeg_url'],
+)
+def test_image_to_string_with_url(test_file):
+    # Tesseract-ocr supports image URLs from version 4.1.1
+    # and must be built with libcurl.
+    if TESSERACT_VERSION < (4, 1, 1) or not HAS_LIBCURL:
+        pytest.skip('skip url test')
+    assert 'The quick brown dog' in image_to_string(test_file)
+
+
 @pytest.mark.parametrize(
     'test_file',
     [TEST_JPEG, Image.open(TEST_JPEG)],
@@ -298,6 +318,15 @@ def test_image_to_data__pandas_support(test_file_small):
         image_to_data(test_file_small, output_type=Output.DATAFRAME)
 
 
+@pytest.mark.skipif(
+    TESSERACT_VERSION >= (4, 1, 1) and HAS_LIBCURL,
+    reason='requires tesseract < 4.1.1 or tesseract built without libcurl',
+)
+def test_image_to_string_url_support():
+    with pytest.raises(URLNotSupported):
+        image_to_string(TEST_JPEG_URL)
+
+
 @pytest.mark.skipif(
     TESSERACT_VERSION[:2] < (3, 5),
     reason='requires tesseract >= 3.05',