ybur-yug · ctwardy · Aug 27, 2018 · Sep 3, 2018 · Sep 14, 2018 · Sep 14, 2018
diff --git a/Dockerfile b/Dockerfile
@@ -1,61 +1,43 @@
 # start with a base image
-FROM ubuntu:14.04
-
-# install dependencies
-RUN apt-get update
-RUN apt-get install -y autoconf automake libtool
-RUN apt-get install -y libpng12-dev
-RUN apt-get install -y libjpeg62-dev
-RUN apt-get install -y g++
-RUN apt-get install -y libtiff4-dev
-RUN apt-get install -y libopencv-dev libtesseract-dev
-RUN apt-get install -y git
-RUN apt-get install -y cmake
-RUN apt-get install -y build-essential
-RUN apt-get install -y libleptonica-dev
-RUN apt-get install -y liblog4cplus-dev
-RUN apt-get install -y libcurl3-dev
-RUN apt-get install -y python2.7-dev
-RUN apt-get install -y tk8.5 tcl8.5 tk8.5-dev tcl8.5-dev
-RUN apt-get build-dep -y python-imaging --fix-missing
-RUN apt-get install -y imagemagick
-RUN apt-get install -y wget
-RUN apt-get install -y python python-pip
-
-# build leptonica
-RUN wget http://www.leptonica.org/source/leptonica-1.70.tar.gz
-RUN tar -zxvf leptonica-1.70.tar.gz
-WORKDIR leptonica-1.70/
-RUN ./autobuild
-RUN ./configure
-RUN make
-RUN make install
-RUN ldconfig
-WORKDIR /
-RUN ls
-
-ADD requirements.txt /
-RUN pip install -r requirements.txt
-
-# build tesseract
-RUN wget https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.02.tar.gz
-RUN tar -zxvf tesseract-ocr-3.02.02.tar.gz
-WORKDIR tesseract-ocr/
-RUN ./autogen.sh
-RUN ./configure
-RUN make
-RUN make install
-RUN ldconfig
-RUN cd ..
-
-# download the relevant Tesseract English Language Packages
-RUN wget https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.eng.tar.gz
-RUN tar -xf tesseract-ocr-3.02.eng.tar.gz
-RUN sudo cp -r tesseract-ocr/tessdata /usr/local/share/
+FROM tesseractshadow/tesseract4re
+
+# Turn off debconf prompts during build (reset further down for runtime).
+ENV DEBIAN_FRONTEND=noninteractive \
+    TERM=linux
+
+# Install system dependencies.
+# apt-get update and install share one RUN so the package index is never
+# stale, and /var/lib/apt/lists is removed in the same layer to save space.
+RUN apt-get update && apt-get install -y \
+    python3-pil \
+    python3-pip \
+    python3-requests \
+ && rm -rf /var/lib/apt/lists/*
+
+# --no-cache-dir keeps pip's download cache out of the image layers.
+RUN pip3 install --no-cache-dir --upgrade pip
+
+# Add requirements.txt before rest of repo, for caching
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r /requirements.txt
+
 
 # update working directories
-ADD ./flask_server /flask_server
+# ADD . /app
+COPY ./flask_server /flask_server
 WORKDIR /flask_server
 
+# Make debconf interactive in the running container
+ENV DEBIAN_FRONTEND=teletype
+
+# Set useful ENV vars
+ENV PYTHONIOENCODING="utf-8"
+
+# Try to forward request and error logs to docker log collector.
+# Not sure this works (app.py logs to a relative 'error.log'). Use /var/log/nginx/* if running nginx.
+RUN ln -sf /dev/stdout /var/log/access.log \
+	&& ln -sf /dev/stderr /var/log/error.log
+
+# Expose the port app.py listens on by default (PORT env var, else 5000), then run
-EXPOSE 80
-CMD ["python", "app.py"]
+EXPOSE 5000
+CMD ["python3", "app.py"]
diff --git a/README.md b/README.md
@@ -1,7 +1,104 @@
-Welcome!
+## Python OCR Docker
 
-Check out the blog post here >> https://realpython.com/blog/python/setting-up-a-simple-ocr-server/
+This is my branch of a
+[RealPython Tutorial](https://realpython.com/blog/python/setting-up-a-simple-ocr-server/).
+I've updated it for Python3 / Unicode, and dropped most of the non-Docker bits
+so I can inherit from the tesseractshadow/tesseract4re Docker container.
+The result is a simple OCR Docker app using Tesseract. It provides:
+* cli.py - a command-line app that takes a URL and returns the text extracted from the image.
+* app.py - a small Flask app that does the same thing, but in the browser.
 
+The only preprocessing is a single call to ImageFilter.SHARPEN.
+
+
+### Changes include:
+* Updated for Python3 & unicode.
+* Refactored to use tesseractshadow/tesseract4re Docker container.
+* Allows file:/// URLs. (Paths are resolved inside the _container_, not on the host!)
+
+### Alternatives
+* [tesseract-ocr-re](https://github.com/tesseract-shadow/tesseract-ocr-re)
+* [tleyden/open-ocr](https://github.com/tleyden/open-ocr) Full-featured queued service. Written in Go.
+
+
+### Usage
+
+Install Docker
+
+If you are not familiar with Docker, please read [Docker - Get Started](https://docs.docker.com/get-started/).
+
+#### Quick Start: CLI
+The following should run the Docker container straight from DockerHub.
+
+```
+docker container run --publish 5000:5000 --interactive --tty ctwardy/python_ocr_tutorial:2.0 python3 cli.py
+```
+
+That should produce the following output:
+```
+===OOOO=====CCCCC===RRRRRR=====
+==OO==OO===CC=======RR===RR====
+==OO==OO===CC=======RR===RR====
+==OO==OO===CC=======RRRRRR=====
+==OO==OO===CC=======RR==RR=====
+==OO==OO===CC=======RR== RR====
+===OOOO=====CCCCC===RR====RR===
+
+A simple OCR utility
+What is the url of the image you would like to analyze?
+```
+
+Type the following:
+```
+file:///flask_server/tests/advertisement.jpg
+```
+
+to see:
+```
+The raw output from tesseract with no processing is:
+-----------------BEGIN-----------------
+b'ADVERTISEMENT.\n\nTus publication of the Works of Joan Knox, it is\nsupposed, will extend to Five Volumes. It was thought\nadvisable to commence the series with his History of\nthe Reformation in Scotland, as the work of greatest\nimportance. The next volume will thus contain the\nThird and Fourth Books, which continue the History to\nthe year 1564; at which period his historical labours\nmay be considered to terminate. But the Fifth Book,\nforming a sequel to the History, and published under\nhis name in 1644, will also be included. His Letters\nand Miscellaneous Writings will be arranged in the\nsubsequent volumes, as nearly as possible in chronolo-\ngical order ; each portion being introduced by a separate\nnotice, respecting the manuscript or printed copies from\nwhich they have been taken.\n\nIt may perhaps be expected that a Life of the Author\nshould have been prefixed to this volume. The Life of\nKnox, by Dr. M\xe2\x80\x98Crig, is however a work so universally\nknown, and of so much historical value, as to supersede\nany attempt that might be made for a detailed bio-'
+------------------END------------------
+```
+
+#### Quick Start: App
+Runs the Flask app from the same image, pulled from DockerHub:
+```
+docker container run --publish 5000:5000 ctwardy/python_ocr_tutorial:2.0
+```
+Then visit `localhost:5000` in your browser and type `file:///flask_server/tests/advertisement.jpg` to
+see essentially the same result, but rendered more nicely. Click "Again" to try another.
+
+Stop it with Ctrl-C as usual.
+
+#### Get this from GitHub
+```
+git clone https://github.com/ctwardy/python_ocr_tutorial.git
+```
+
+#### Build the Docker image:
+From the python_ocr_tutorial folder, do:
+```docker image build --tag python_ocr_test .```
+(Note the trailing "." -- it means build from this folder.)
+
+#### Pull the Docker image:
+In case you want to pull but not run.
+```docker pull ctwardy/python_ocr_tutorial:2.0```
+
+#### Run either the CLI or the App:
+We did these above, but for reference.
+* CLI: `docker container run --publish 5000:5000 --interactive --tty
+    ctwardy/python_ocr_tutorial:2.0 python3 cli.py`
+* App: `docker container run --publish 5000:5000 ctwardy/python_ocr_tutorial:2.0`
+
+Stop the app using Ctrl-C.
+
+### TODO (or see GitHub Issues):
+* Browser app still won't display "<" characters. The returned JSON is fine,
+so it's getting sanitized in the javascript or browser.
+* Add endpoint for POSTing images directly.
+* Add file upload widget.
+* Try Tesseract4 instead of Tesseract3: tesseractshadow/tesseract4cmp
 
 
 :)
diff --git a/_app.sh b/_app.sh
diff --git a/_run.sh b/_run.sh
diff --git a/flask_server/Procfile b/flask_server/Procfile
@@ -1 +1 @@
-web: python app.py
+web: python3 app.py
diff --git a/flask_server/Procfile.dev b/flask_server/Procfile.dev
@@ -1,4 +1,4 @@
 # Procfile.dev - development
 
 # Use the Flask development server.
-web: python app.py
+web: python3 app.py
diff --git a/flask_server/app.py b/flask_server/app.py
@@ -1,7 +1,10 @@
+# -*- coding: utf-8 -*-
+
 import os
 import logging
 from logging import Formatter, FileHandler
 from flask import Flask, request, jsonify, render_template
+import json
 
 from ocr import process_image
 
@@ -16,27 +19,50 @@ def main():
 
 @app.route('/v{}/ocr'.format(_VERSION), methods=["POST"])
 def ocr():
+    """OCR the image at the posted 'image_url'; reply with JSON 'output' or 'error'."""
+    # Read the URL
+    try:
+        url = request.get_json()['image_url']
+    except TypeError:  # presumably get_json() gave None (body not sent as JSON) -- parse by hand
+        print("TypeError trying get_json(). Trying to load from string.")
+        try:
+            data = json.loads(request.data.decode('utf-8'))  # encoding= is rejected by Python >= 3.9
+            url = data['image_url'] if 'image_url' in data else data['img_url']  # keep legacy key
+        except Exception:
+            return jsonify(
+                {"error": "Could not get 'image_url' from the request object. Use JSON?",
+                 "data": request.data.decode('utf-8', 'replace')}  # bytes are not JSON-serializable
+            )
+    except Exception:
+        return jsonify(
+            {"error": "Non-TypeError. Did you send {'image_url': 'http://.....'}",
+             "data": request.data.decode('utf-8', 'replace')}
+        )
+
+    # Process the image
+    print("URL extracted:", url)
     try:
-        url = request.json['image_url']
-        if 'jpg' in url:
-            output = process_image(url)
-            return jsonify({"output": output})
-        else:
-            return jsonify({"error": "only .jpg files, please"})
+        output = process_image(url)
+    except OSError:
+        return jsonify({"error": "URL not recognized as image.",
+                        "url": url})
-    except:
+    except Exception:
         return jsonify(
-            {"error": "Did you mean to send: {'image_url': 'some_jpeg_url'}"}
+            {"error": "Unknown error processing image.",
+             "request": request.data.decode('utf-8', 'replace')}
         )
+    app.logger.info(output)
+    return jsonify({"output": output})
 
 
 @app.errorhandler(500)
 def internal_error(error):
-    print str(error)  # ghetto logging
+    """Log an unhandled server error and answer with a JSON 500 response."""
+    print("*** 500 ***\n{}".format(str(error)))  # ghetto logging
+    # A Flask error handler must return a response; returning None is itself an error.
+    return jsonify({"error": "Internal server error."}), 500
 
 
 @app.errorhandler(404)
 def not_found_error(error):
-    print str(error)
+    """Log a request for an unknown URL and answer with a JSON 404 response."""
+    print("*** 404 ***\n{}".format(str(error)))
+    return jsonify({"error": "Not found."}), 404
 
 if not app.debug:
     file_handler = FileHandler('error.log')
@@ -52,4 +78,5 @@ def not_found_error(error):
 
 if __name__ == '__main__':
     port = int(os.environ.get('PORT', 5000))
+    print("Started app.py on port: {}".format(port))  # .format() so the port is actually interpolated
     app.run(host='0.0.0.0', port=port)