Merge pull request #197 from danielquinn/pluggable-consumers
Pluggable consumers
This commit is contained in:
		
						commit
						b7cb708053
					
				@ -8,7 +8,9 @@ matrix:
 | 
			
		||||
          env: TOXENV=py34
 | 
			
		||||
        - python: 3.5
 | 
			
		||||
          env: TOXENV=py35
 | 
			
		||||
        - python: 3.5
 | 
			
		||||
        - python: 3.6
 | 
			
		||||
          env: TOXENV=py36
 | 
			
		||||
        - python: 3.6
 | 
			
		||||
          env: TOXENV=pep8
 | 
			
		||||
 | 
			
		||||
install:
 | 
			
		||||
 | 
			
		||||
@ -4,6 +4,14 @@ Changelog
 | 
			
		||||
* 0.3.6
 | 
			
		||||
  * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
 | 
			
		||||
    correspondent or the tags for a document.
 | 
			
		||||
  * The ``content`` field is now optional, to allow for the edge case of a
 | 
			
		||||
    purely graphical document.
 | 
			
		||||
  * You can no longer add documents via the admin.  This never worked in the
 | 
			
		||||
    first place, so all I've done here is remove the link to the broken form.
 | 
			
		||||
  * The consumer code has been heavily refactored to support a pluggable
 | 
			
		||||
    interface.  Install a paperless consumer via pip and tell paperless about
 | 
			
		||||
    it with an environment variable, and you're good to go.  Proper
 | 
			
		||||
    documentation is on its way.
 | 
			
		||||
 | 
			
		||||
* 0.3.5
 | 
			
		||||
  * A serious facelift for the documents listing page wherein we drop the
 | 
			
		||||
 | 
			
		||||
@ -67,6 +67,7 @@ class DocumentAdmin(CommonAdmin):
 | 
			
		||||
 | 
			
		||||
    def created_(self, obj):
 | 
			
		||||
        return obj.created.date().strftime("%Y-%m-%d")
 | 
			
		||||
    created_.short_description = "Created"
 | 
			
		||||
 | 
			
		||||
    def thumbnail(self, obj):
 | 
			
		||||
        png_img = self._html_tag(
 | 
			
		||||
 | 
			
		||||
@ -1,35 +1,21 @@
 | 
			
		||||
import datetime
 | 
			
		||||
import hashlib
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import uuid
 | 
			
		||||
import shutil
 | 
			
		||||
import hashlib
 | 
			
		||||
import logging
 | 
			
		||||
import datetime
 | 
			
		||||
import tempfile
 | 
			
		||||
import itertools
 | 
			
		||||
import subprocess
 | 
			
		||||
from multiprocessing.pool import Pool
 | 
			
		||||
 | 
			
		||||
import pyocr
 | 
			
		||||
import langdetect
 | 
			
		||||
from PIL import Image
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.utils import timezone
 | 
			
		||||
from paperless.db import GnuPG
 | 
			
		||||
from pyocr.tesseract import TesseractError
 | 
			
		||||
from pyocr.libtesseract.tesseract_raw import \
 | 
			
		||||
    TesseractError as OtherTesseractError
 | 
			
		||||
 | 
			
		||||
from .models import Tag, Document, FileInfo
 | 
			
		||||
from .models import Document, FileInfo, Tag
 | 
			
		||||
from .parsers import ParseError
 | 
			
		||||
from .signals import (
 | 
			
		||||
    document_consumption_started,
 | 
			
		||||
    document_consumption_finished
 | 
			
		||||
    document_consumer_declaration,
 | 
			
		||||
    document_consumption_finished,
 | 
			
		||||
    document_consumption_started
 | 
			
		||||
)
 | 
			
		||||
from .languages import ISO639
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class OCRError(Exception):
    """Raised when the OCR stage cannot produce text for a document."""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ConsumerError(Exception):
 | 
			
		||||
@ -47,13 +33,7 @@ class Consumer(object):
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    SCRATCH = settings.SCRATCH_DIR
 | 
			
		||||
    CONVERT = settings.CONVERT_BINARY
 | 
			
		||||
    UNPAPER = settings.UNPAPER_BINARY
 | 
			
		||||
    CONSUME = settings.CONSUMPTION_DIR
 | 
			
		||||
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 | 
			
		||||
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
 | 
			
		||||
 | 
			
		||||
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 | 
			
		||||
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
 | 
			
		||||
@ -78,6 +58,16 @@ class Consumer(object):
 | 
			
		||||
            raise ConsumerError(
 | 
			
		||||
                "Consumption directory {} does not exist".format(self.CONSUME))
 | 
			
		||||
 | 
			
		||||
        self.parsers = []
 | 
			
		||||
        for response in document_consumer_declaration.send(self):
 | 
			
		||||
            self.parsers.append(response[1])
 | 
			
		||||
 | 
			
		||||
        if not self.parsers:
 | 
			
		||||
            raise ConsumerError(
 | 
			
		||||
                "No parsers could be found, not even the default.  "
 | 
			
		||||
                "This is a problem."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def log(self, level, message):
 | 
			
		||||
        getattr(self.logger, level)(message, extra={
 | 
			
		||||
            "group": self.logging_group
 | 
			
		||||
@ -109,6 +99,13 @@ class Consumer(object):
 | 
			
		||||
                self._ignore.append(doc)
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            parser_class = self._get_parser_class(doc)
 | 
			
		||||
            if not parser_class:
 | 
			
		||||
                self.log(
 | 
			
		||||
                    "info", "No parsers could be found for {}".format(doc))
 | 
			
		||||
                self._ignore.append(doc)
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            self.logging_group = uuid.uuid4()
 | 
			
		||||
 | 
			
		||||
            self.log("info", "Consuming {}".format(doc))
 | 
			
		||||
@ -119,25 +116,26 @@ class Consumer(object):
 | 
			
		||||
                logging_group=self.logging_group
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
 | 
			
		||||
            imgs = self._get_greyscale(tempdir, doc)
 | 
			
		||||
            thumbnail = self._get_thumbnail(tempdir, doc)
 | 
			
		||||
            parsed_document = parser_class(doc)
 | 
			
		||||
            thumbnail = parsed_document.get_thumbnail()
 | 
			
		||||
 | 
			
		||||
            try:
 | 
			
		||||
 | 
			
		||||
                document = self._store(self._get_ocr(imgs), doc, thumbnail)
 | 
			
		||||
 | 
			
		||||
            except OCRError as e:
 | 
			
		||||
                document = self._store(
 | 
			
		||||
                    parsed_document.get_text(),
 | 
			
		||||
                    doc,
 | 
			
		||||
                    thumbnail
 | 
			
		||||
                )
 | 
			
		||||
            except ParseError as e:
 | 
			
		||||
 | 
			
		||||
                self._ignore.append(doc)
 | 
			
		||||
                self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
 | 
			
		||||
                self._cleanup_tempdir(tempdir)
 | 
			
		||||
                self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
 | 
			
		||||
                parsed_document.cleanup()
 | 
			
		||||
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            else:
 | 
			
		||||
 | 
			
		||||
                self._cleanup_tempdir(tempdir)
 | 
			
		||||
                parsed_document.cleanup()
 | 
			
		||||
                self._cleanup_doc(doc)
 | 
			
		||||
 | 
			
		||||
                self.log(
 | 
			
		||||
@ -151,142 +149,20 @@ class Consumer(object):
 | 
			
		||||
                    logging_group=self.logging_group
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
    def _get_greyscale(self, tempdir, doc):
 | 
			
		||||
    def _get_parser_class(self, doc):
 | 
			
		||||
        """
 | 
			
		||||
        Greyscale images are easier for Tesseract to OCR
 | 
			
		||||
        Determine the appropriate parser class based on the file
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        self.log("info", "Generating greyscale image from {}".format(doc))
 | 
			
		||||
        options = []
 | 
			
		||||
        for parser in self.parsers:
 | 
			
		||||
            result = parser(doc)
 | 
			
		||||
            if result:
 | 
			
		||||
                options.append(result)
 | 
			
		||||
 | 
			
		||||
        # Convert PDF to multiple PNMs
 | 
			
		||||
        pnm = os.path.join(tempdir, "convert-%04d.pnm")
 | 
			
		||||
        run_convert(
 | 
			
		||||
            self.CONVERT,
 | 
			
		||||
            "-density", str(self.DENSITY),
 | 
			
		||||
            "-depth", "8",
 | 
			
		||||
            "-type", "grayscale",
 | 
			
		||||
            doc, pnm,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # Get a list of converted images
 | 
			
		||||
        pnms = []
 | 
			
		||||
        for f in os.listdir(tempdir):
 | 
			
		||||
            if f.endswith(".pnm"):
 | 
			
		||||
                pnms.append(os.path.join(tempdir, f))
 | 
			
		||||
 | 
			
		||||
        # Run unpaper in parallel on converted images
 | 
			
		||||
        with Pool(processes=self.THREADS) as pool:
 | 
			
		||||
            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
 | 
			
		||||
 | 
			
		||||
        # Return list of converted images, processed with unpaper
 | 
			
		||||
        pnms = []
 | 
			
		||||
        for f in os.listdir(tempdir):
 | 
			
		||||
            if f.endswith(".unpaper.pnm"):
 | 
			
		||||
                pnms.append(os.path.join(tempdir, f))
 | 
			
		||||
 | 
			
		||||
        return sorted(filter(lambda __: os.path.isfile(__), pnms))
 | 
			
		||||
 | 
			
		||||
    def _get_thumbnail(self, tempdir, doc):
 | 
			
		||||
        """
 | 
			
		||||
        The thumbnail of a PDF is just a 500px wide image of the first page.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        self.log("info", "Generating the thumbnail")
 | 
			
		||||
 | 
			
		||||
        run_convert(
 | 
			
		||||
            self.CONVERT,
 | 
			
		||||
            "-scale", "500x5000",
 | 
			
		||||
            "-alpha", "remove",
 | 
			
		||||
            doc, os.path.join(tempdir, "convert-%04d.png")
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        return os.path.join(tempdir, "convert-0000.png")
 | 
			
		||||
 | 
			
		||||
    def _guess_language(self, text):
 | 
			
		||||
        try:
 | 
			
		||||
            guess = langdetect.detect(text)
 | 
			
		||||
            self.log("debug", "Language detected: {}".format(guess))
 | 
			
		||||
            return guess
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            self.log("warning", "Language detection error: {}".format(e))
 | 
			
		||||
 | 
			
		||||
    def _get_ocr(self, imgs):
 | 
			
		||||
        """
 | 
			
		||||
        Attempts to do the best job possible OCR'ing the document based on
 | 
			
		||||
        simple language detection trial & error.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if not imgs:
 | 
			
		||||
            raise OCRError("No images found")
 | 
			
		||||
 | 
			
		||||
        self.log("info", "OCRing the document")
 | 
			
		||||
 | 
			
		||||
        # Since the division gets rounded down by int, this calculation works
 | 
			
		||||
        # for every edge-case, i.e. 1
 | 
			
		||||
        middle = int(len(imgs) / 2)
 | 
			
		||||
        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
 | 
			
		||||
 | 
			
		||||
        guessed_language = self._guess_language(raw_text)
 | 
			
		||||
 | 
			
		||||
        if not guessed_language or guessed_language not in ISO639:
 | 
			
		||||
            self.log("warning", "Language detection failed!")
 | 
			
		||||
            if settings.FORGIVING_OCR:
 | 
			
		||||
                self.log(
 | 
			
		||||
                    "warning",
 | 
			
		||||
                    "As FORGIVING_OCR is enabled, we're going to make the "
 | 
			
		||||
                    "best with what we have."
 | 
			
		||||
                )
 | 
			
		||||
                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
 | 
			
		||||
                return raw_text
 | 
			
		||||
            raise OCRError("Language detection failed")
 | 
			
		||||
 | 
			
		||||
        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
 | 
			
		||||
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
 | 
			
		||||
            return raw_text
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            return self._ocr(imgs, ISO639[guessed_language])
 | 
			
		||||
        except pyocr.pyocr.tesseract.TesseractError:
 | 
			
		||||
            if settings.FORGIVING_OCR:
 | 
			
		||||
                self.log(
 | 
			
		||||
                    "warning",
 | 
			
		||||
                    "OCR for {} failed, but we're going to stick with what "
 | 
			
		||||
                    "we've got since FORGIVING_OCR is enabled.".format(
 | 
			
		||||
                        guessed_language
 | 
			
		||||
                    )
 | 
			
		||||
                )
 | 
			
		||||
                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
 | 
			
		||||
                return raw_text
 | 
			
		||||
            raise OCRError(
 | 
			
		||||
                "The guessed language is not available in this instance of "
 | 
			
		||||
                "Tesseract."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def _assemble_ocr_sections(self, imgs, middle, text):
 | 
			
		||||
        """
 | 
			
		||||
        Given a `middle` value and the text that middle page represents, we OCR
 | 
			
		||||
        the remainder of the document and return the whole thing.
 | 
			
		||||
        """
 | 
			
		||||
        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
 | 
			
		||||
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
 | 
			
		||||
        return text
 | 
			
		||||
 | 
			
		||||
    def _ocr(self, imgs, lang):
 | 
			
		||||
        """
 | 
			
		||||
        Performs a single OCR attempt.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if not imgs:
 | 
			
		||||
            return ""
 | 
			
		||||
 | 
			
		||||
        self.log("info", "Parsing for {}".format(lang))
 | 
			
		||||
 | 
			
		||||
        with Pool(processes=self.THREADS) as pool:
 | 
			
		||||
            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
 | 
			
		||||
            r = " ".join(r)
 | 
			
		||||
 | 
			
		||||
        # Strip out excess white space to allow matching to go smoother
 | 
			
		||||
        return strip_excess_whitespace(r)
 | 
			
		||||
        # Return the parser with the highest weight.
 | 
			
		||||
        return sorted(
 | 
			
		||||
            options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
 | 
			
		||||
 | 
			
		||||
    def _store(self, text, doc, thumbnail):
 | 
			
		||||
 | 
			
		||||
@ -332,10 +208,6 @@ class Consumer(object):
 | 
			
		||||
 | 
			
		||||
        return document
 | 
			
		||||
 | 
			
		||||
    def _cleanup_tempdir(self, d):
 | 
			
		||||
        self.log("debug", "Deleting directory {}".format(d))
 | 
			
		||||
        shutil.rmtree(d)
 | 
			
		||||
 | 
			
		||||
    def _cleanup_doc(self, doc):
 | 
			
		||||
        self.log("debug", "Deleting document {}".format(doc))
 | 
			
		||||
        os.unlink(doc)
 | 
			
		||||
@ -361,41 +233,3 @@ class Consumer(object):
 | 
			
		||||
        with open(doc, "rb") as f:
 | 
			
		||||
            checksum = hashlib.md5(f.read()).hexdigest()
 | 
			
		||||
        return Document.objects.filter(checksum=checksum).exists()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def strip_excess_whitespace(text):
    """
    Normalise the horizontal whitespace in ``text``.

    * Every run of whitespace that is not a line break is collapsed into a
      single space.
    * Whitespace directly following a run of line breaks is removed, so
      lines never start with a space.
    * Whitespace at the very end of the text (or just before a final line
      break) is removed.

    Line breaks themselves are always preserved.  Returns the cleaned string.
    """

    # ``[^\S\r\n]`` means "whitespace, but neither CR nor LF".  All patterns
    # are raw strings so that ``\S`` reaches the regex engine untouched
    # instead of being treated as an (invalid) string escape.
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)[^\S\n\r]+", r"\1", collapsed_spaces)

    # No re.MULTILINE here on purpose: only the end of the whole text is
    # trimmed, matching the established behaviour.
    no_trailing_whitespace = re.sub(
        r"[^\S\n\r]+$", "", no_leading_whitespace)
    return no_trailing_whitespace
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def image_to_string(args):
    """
    OCR a single image and return the recognised text.

    ``args`` is one ``(image_name, language)`` pair packed into a single
    argument.  ``image_name`` is resolved relative to ``Consumer.SCRATCH``.

    If the OCR tool supports orientation detection, the image is rotated by
    the detected angle before being read.  A Tesseract error raised while
    detecting orientation is ignored and the image is read as-is.
    """
    image_name, language = args
    engine = pyocr.get_available_tools()[0]
    image_path = os.path.join(Consumer.SCRATCH, image_name)

    with Image.open(image_path) as page:
        upright = page
        if engine.can_detect_orientation():
            try:
                detected = engine.detect_orientation(page, lang=language)
                upright = page.rotate(detected["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                # Best effort only: fall back to the unrotated page.
                pass
        return engine.image_to_string(upright, lang=language)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_unpaper(args):
    """
    Run the ``unpaper`` binary on one PNM file and wait for it to finish.

    ``args`` is a single ``(unpaper, pnm)`` pair: the path of the unpaper
    executable and the path of the image to clean.  Taking one packed
    argument lets the function be handed straight to a ``map``-style call.

    The result is written next to the input, with the ``.pnm`` extension
    replaced by ``.unpaper.pnm``.  The exit status of the process is not
    inspected and nothing is returned.
    """
    unpaper, pnm = args

    extension = ".pnm"
    if pnm.endswith(extension):
        # Only rewrite the trailing extension.  A blanket str.replace() would
        # also mangle any ".pnm" that happens to appear in a directory name.
        target = pnm[:-len(extension)] + ".unpaper.pnm"
    else:
        # Unexpected name: keep the historical substitution behaviour.
        target = pnm.replace(extension, ".unpaper.pnm")

    subprocess.Popen((unpaper, pnm, target)).wait()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_convert(*args):
    """
    Run a command (the convert binary followed by its arguments) and wait
    for it to finish.

    The child inherits a copy of the current environment.  When
    ``settings.CONVERT_MEMORY_LIMIT`` or ``settings.CONVERT_TMPDIR`` are set
    to a truthy value they are passed on as ``MAGICK_MEMORY_LIMIT`` and
    ``MAGICK_TMPDIR`` respectively.  The exit status is not inspected.
    """
    environment = os.environ.copy()

    overrides = {
        "MAGICK_MEMORY_LIMIT": settings.CONVERT_MEMORY_LIMIT,
        "MAGICK_TMPDIR": settings.CONVERT_TMPDIR,
    }
    # Only export the variables that have actually been configured.
    environment.update(
        {name: value for name, value in overrides.items() if value})

    subprocess.Popen(args, env=environment).wait()
 | 
			
		||||
 | 
			
		||||
@ -158,13 +158,22 @@ class Document(models.Model):
 | 
			
		||||
 | 
			
		||||
    correspondent = models.ForeignKey(
 | 
			
		||||
        Correspondent, blank=True, null=True, related_name="documents")
 | 
			
		||||
 | 
			
		||||
    title = models.CharField(max_length=128, blank=True, db_index=True)
 | 
			
		||||
    content = models.TextField(db_index=True)
 | 
			
		||||
 | 
			
		||||
    content = models.TextField(
 | 
			
		||||
        db_index=True,
 | 
			
		||||
        blank=True,
 | 
			
		||||
        help_text="The raw, text-only data of the document.  This field is "
 | 
			
		||||
                  "primarily used for searching."
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    file_type = models.CharField(
 | 
			
		||||
        max_length=4,
 | 
			
		||||
        editable=False,
 | 
			
		||||
        choices=tuple([(t, t.upper()) for t in TYPES])
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    tags = models.ManyToManyField(
 | 
			
		||||
        Tag, related_name="documents", blank=True)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,45 @@
 | 
			
		||||
import logging
 | 
			
		||||
import shutil
 | 
			
		||||
import tempfile
 | 
			
		||||
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ParseError(Exception):
    """Raised by a document parser when a document cannot be parsed."""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class DocumentParser(object):
    """
    Base class for document parsers.

    Subclass this to make your own parser and implement ``get_thumbnail()``
    and ``get_text()``.  Have a look at `paperless_tesseract.parsers` for
    inspiration.
    """

    # Parent directory in which every parser instance creates its own
    # temporary working directory.
    SCRATCH = settings.SCRATCH_DIR

    def __init__(self, path):
        """
        Remember the document at ``path`` and create a fresh temporary
        working directory for it underneath ``SCRATCH``.
        """
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
        self.logger = logging.getLogger(__name__)
        self.logging_group = None

    def get_thumbnail(self):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        Must be implemented by subclasses.
        """
        raise NotImplementedError()

    def get_text(self):
        """
        Returns the text from the document and only the text.
        Must be implemented by subclasses.
        """
        raise NotImplementedError()

    def log(self, level, message):
        """
        Emit ``message`` through the logger method named by ``level``
        (e.g. "debug", "info"), tagging the record with the current
        logging group.
        """
        emit = getattr(self.logger, level)
        emit(message, extra={"group": self.logging_group})

    def cleanup(self):
        """
        Delete this parser's temporary working directory and its contents.
        """
        doomed = self.tempdir
        self.log("debug", "Deleting directory {}".format(doomed))
        shutil.rmtree(doomed)
 | 
			
		||||
@ -2,3 +2,4 @@ from django.dispatch import Signal
 | 
			
		||||
 | 
			
		||||
document_consumption_started = Signal(providing_args=["filename"])
 | 
			
		||||
document_consumption_finished = Signal(providing_args=["document"])
 | 
			
		||||
document_consumer_declaration = Signal(providing_args=[])
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,5 @@
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
from subprocess import Popen
 | 
			
		||||
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
 | 
			
		||||
@ -158,7 +158,7 @@
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
<script>
 | 
			
		||||
  // We nee to re-build the select-all functionality as the old logic pointed
 | 
			
		||||
  // We need to re-build the select-all functionality as the old logic pointed
 | 
			
		||||
  // to a table and we're using divs now.
 | 
			
		||||
  django.jQuery("#action-toggle").on("change", function(){
 | 
			
		||||
    django.jQuery(".grid .box .result .checkbox input")
 | 
			
		||||
 | 
			
		||||
@ -1,13 +1,6 @@
 | 
			
		||||
import os
 | 
			
		||||
from unittest import mock, skipIf
 | 
			
		||||
 | 
			
		||||
import pyocr
 | 
			
		||||
from django.test import TestCase
 | 
			
		||||
from pyocr.libtesseract.tesseract_raw import \
 | 
			
		||||
    TesseractError as OtherTesseractError
 | 
			
		||||
 | 
			
		||||
from ..models import FileInfo
 | 
			
		||||
from ..consumer import image_to_string, strip_excess_whitespace
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestAttributes(TestCase):
 | 
			
		||||
@ -308,71 +301,3 @@ class TestFieldPermutations(TestCase):
 | 
			
		||||
                        }
 | 
			
		||||
                        self._test_guessed_attributes(
 | 
			
		||||
                            template.format(**spec), **spec)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FakeTesseract(object):
    """
    Test double for an OCR tool: it claims to support orientation detection,
    always fails when asked to detect it, and returns a fixed string as the
    recognised text.
    """

    @staticmethod
    def can_detect_orientation():
        """Pretend that orientation detection is available."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always fail with the libtesseract flavour of TesseractError."""
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        """Return canned text regardless of the image or language given."""
        return "This is test text"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FakePyOcr(object):
    """
    Test double for the ``pyocr`` module whose only available tool is
    ``FakeTesseract``.
    """

    @staticmethod
    def get_available_tools():
        """Return the list of available OCR tools: just the fake one."""
        available = [FakeTesseract]
        return available
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestOCR(TestCase):
    """
    Tests for the OCR helper functions ``strip_excess_whitespace`` and
    ``image_to_string``.
    """

    # (input, expected output) pairs for strip_excess_whitespace().
    text_cases = [
        ("simple     string", "simple string"),
        (
            "simple    newline\n   testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8   строка с пробелами в конце  ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    # Directory holding the sample images used by these tests.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    # True when pyocr reports at least one usable OCR tool.
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        """Every entry in ``text_cases`` is normalised as expected."""
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                # The message previously misspelled the function name.
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
    @mock.patch("documents.consumer.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        # Passes as long as no exception escapes image_to_string().
        image_to_string(["no-text.png", "en"])
 | 
			
		||||
 | 
			
		||||
@ -61,6 +61,7 @@ INSTALLED_APPS = [
 | 
			
		||||
    "django_extensions",
 | 
			
		||||
 | 
			
		||||
    "documents.apps.DocumentsConfig",
 | 
			
		||||
    "paperless_tesseract.apps.PaperlessTesseractConfig",
 | 
			
		||||
 | 
			
		||||
    "flat_responsive",
 | 
			
		||||
    "django.contrib.admin",
 | 
			
		||||
@ -70,6 +71,9 @@ INSTALLED_APPS = [
 | 
			
		||||
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
if os.getenv("PAPERLESS_INSTALLED_APPS"):
 | 
			
		||||
    INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",")
 | 
			
		||||
 | 
			
		||||
MIDDLEWARE_CLASSES = [
 | 
			
		||||
    'django.middleware.security.SecurityMiddleware',
 | 
			
		||||
    'django.contrib.sessions.middleware.SessionMiddleware',
 | 
			
		||||
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = (0, 3, 5)
 | 
			
		||||
__version__ = (0, 3, 6)
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										0
									
								
								src/paperless_tesseract/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_tesseract/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										16
									
								
								src/paperless_tesseract/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless_tesseract/apps.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,16 @@
 | 
			
		||||
from django.apps import AppConfig
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PaperlessTesseractConfig(AppConfig):
    """
    App configuration for ``paperless_tesseract``.

    Once the app is ready, it connects ``ConsumerDeclaration.handle`` to the
    ``document_consumer_declaration`` signal.
    """

    name = "paperless_tesseract"

    def ready(self):
        """Hook the consumer declaration handler up to the documents signal."""
        # NOTE(review): imported inside ready() rather than at module level,
        # presumably so nothing is imported before apps are loaded -- confirm.
        from documents.signals import document_consumer_declaration
        from .signals import ConsumerDeclaration

        handler = ConsumerDeclaration.handle
        document_consumer_declaration.connect(handler)

        super(PaperlessTesseractConfig, self).ready()
 | 
			
		||||
							
								
								
									
										214
									
								
								src/paperless_tesseract/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										214
									
								
								src/paperless_tesseract/parsers.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,214 @@
 | 
			
		||||
import itertools
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import subprocess
 | 
			
		||||
from multiprocessing.pool import Pool
 | 
			
		||||
 | 
			
		||||
import langdetect
 | 
			
		||||
import pyocr
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from documents.parsers import DocumentParser, ParseError
 | 
			
		||||
from PIL import Image
 | 
			
		||||
from pyocr.libtesseract.tesseract_raw import \
 | 
			
		||||
    TesseractError as OtherTesseractError
 | 
			
		||||
from pyocr.tesseract import TesseractError
 | 
			
		||||
 | 
			
		||||
from .languages import ISO639
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class OCRError(Exception):
    """Raised when OCR fails; ``get_text()`` re-raises it as ParseError."""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)

    ``self.document_path``, ``self.tempdir`` and ``self.log`` are used below
    but not defined here; presumably they come from ``DocumentParser``.
    """

    # Command handed to run_convert() as the program to execute.
    CONVERT = settings.CONVERT_BINARY
    # Value for convert's "-density" option; 300 when the setting is falsy.
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    # Number of worker processes for the pools below; None leaves the choice
    # to multiprocessing.
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    # Command handed to run_unpaper() as the program to execute.
    UNPAPER = settings.UNPAPER_BINARY
    # Language used for the first OCR pass, before language detection.
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def get_thumbnail(self):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.

        Returns the path of the first image written by convert
        (``convert-0000.png`` inside ``self.tempdir``).
        """

        run_convert(
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
            self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
        )

        return os.path.join(self.tempdir, "convert-0000.png")

    def get_text(self):
        """
        Return the text found in the document by OCR.

        Raises ParseError when the OCR step fails with an OCRError.
        """

        images = self._get_greyscale()

        try:
            return self._get_ocr(images)
        except OCRError as e:
            # Chain explicitly so the original OCR failure stays attached.
            raise ParseError(e) from e

    def _get_greyscale(self):
        """
        Greyscale images are easier for Tesseract to OCR

        Converts the document to greyscale PNM images, runs unpaper over each
        of them and returns the sorted list of paths to the unpaper output.
        """

        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
        run_convert(
            self.CONVERT,
            "-density", str(self.DENSITY),
            "-depth", "8",
            "-type", "grayscale",
            self.document_path, pnm,
        )

        # Get a list of converted images.  Output from an earlier unpaper run
        # also ends in ".pnm", so skip it rather than processing it twice.
        pnms = [
            os.path.join(self.tempdir, f)
            for f in os.listdir(self.tempdir)
            if f.endswith(".pnm") and not f.endswith(".unpaper.pnm")
        ]

        # Run unpaper in parallel on converted images
        with Pool(processes=self.THREADS) as pool:
            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))

        # Return list of converted images, processed with unpaper
        cleaned = [
            os.path.join(self.tempdir, f)
            for f in os.listdir(self.tempdir)
            if f.endswith(".unpaper.pnm")
        ]

        return sorted(filter(os.path.isfile, cleaned))

    def _guess_language(self, text):
        """
        Return the language code langdetect reports for ``text``, or None if
        detection raises.  Failures are logged as warnings, not propagated.
        """
        try:
            guess = langdetect.detect(text)
            self.log("debug", "Language detected: {}".format(guess))
            return guess
        except Exception as e:
            # Deliberately broad: any detection failure means "no guess".
            self.log("warning", "Language detection error: {}".format(e))
            return None

    def _get_ocr(self, imgs):
        """
        Attempts to do the best job possible OCR'ing the document based on
        simple language detection trial & error.

        The middle page is OCR'd with the default language and used to guess
        the document language.  If the guess is the default language, the
        remaining pages are OCR'd with it as well; otherwise the whole
        document is OCR'd again with the guessed language.

        Raises OCRError when there are no images, when language detection
        fails, or when OCR with the guessed language fails -- the latter two
        only if settings.FORGIVING_OCR is not enabled.
        """

        if not imgs:
            raise OCRError("No images found")

        self.log("info", "OCRing the document")

        # Floor division always yields a valid index for a non-empty list,
        # including the single-page case (index 0).
        middle = len(imgs) // 2
        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

        guessed_language = self._guess_language(raw_text)

        if not guessed_language or guessed_language not in ISO639:
            self.log("warning", "Language detection failed!")
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "As FORGIVING_OCR is enabled, we're going to make the "
                    "best with what we have."
                )
                return self._assemble_ocr_sections(imgs, middle, raw_text)
            raise OCRError("Language detection failed")

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            return self._assemble_ocr_sections(imgs, middle, raw_text)

        try:
            return self._ocr(imgs, ISO639[guessed_language])
        except (TesseractError, OtherTesseractError):
            # Both error classes are handled, matching image_to_string().
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    )
                )
                return self._assemble_ocr_sections(imgs, middle, raw_text)
            raise OCRError(
                "The guessed language is not available in this instance of "
                "Tesseract."
            )

    def _ocr(self, imgs, lang):
        """
        Performs a single OCR attempt.

        Each image is OCR'd with ``lang`` in a worker pool; the per-image
        results are joined with a space and passed through
        strip_excess_whitespace().  Returns "" when ``imgs`` is empty.
        """

        if not imgs:
            return ""

        self.log("info", "Parsing for {}".format(lang))

        with Pool(processes=self.THREADS) as pool:
            pages = pool.map(image_to_string, itertools.product(imgs, [lang]))

        # Strip out excess white space to allow matching to go smoother
        return strip_excess_whitespace(" ".join(pages))

    def _assemble_ocr_sections(self, imgs, middle, text):
        """
        Given a `middle` value and the text that middle page represents, we OCR
        the remainder of the document and return the whole thing.

        The sections are joined with a single space, the same separator
        _ocr() puts between pages, so words on adjacent pages are not glued
        together.  Empty sections are left out.
        """
        before = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE)
        after = self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
        return " ".join(
            section for section in (before, text, after) if section
        )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_convert(*args):
    """
    Run the command described by ``args`` and wait for it to finish.

    The child inherits a copy of the current environment.  When
    settings.CONVERT_MEMORY_LIMIT or settings.CONVERT_TMPDIR is set, its
    value is passed on as MAGICK_MEMORY_LIMIT or MAGICK_TMPDIR respectively.
    The exit status of the command is not inspected.
    """

    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )

    environment = os.environ.copy()
    for variable, value in overrides:
        if value:
            environment[variable] = value

    process = subprocess.Popen(args, env=environment)
    process.wait()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_unpaper(args):
    """
    Run unpaper on a single image and wait for it to finish.

    ``args`` is an ``(unpaper, pnm)`` pair: the command to execute and the
    path of the input image.  It is one argument so the function can be used
    with ``Pool.map``.  The output is written next to the input with
    ".unpaper.pnm" in place of the ".pnm" extension.  The exit status is not
    inspected.
    """
    unpaper, pnm = args

    suffix = ".pnm"
    if pnm.endswith(suffix):
        # Only touch the extension: a blanket str.replace would also rewrite
        # any directory component that happens to contain ".pnm".
        cleaned = pnm[:-len(suffix)] + ".unpaper" + suffix
    else:
        # Not expected from the callers in this module; keep the previous
        # behaviour for such paths.
        cleaned = pnm.replace(suffix, ".unpaper" + suffix)

    subprocess.Popen((unpaper, pnm, cleaned)).wait()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def strip_excess_whitespace(text):
    """
    Normalise the whitespace in ``text`` and return the result.

    Three passes are applied in order:

    1. every run of whitespace other than line breaks becomes one space;
    2. whitespace directly following a run of line breaks is removed;
    3. whitespace at the very end of the text is removed.

    Line breaks themselves are kept, as is whitespace at the very start of
    the text and whitespace directly before a line break inside the text.
    """
    # Raw string literals keep "\S" from being read as an (invalid) string
    # escape; the compiled patterns are unchanged.
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", r"\1", collapsed_spaces)
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", "", no_leading_whitespace)
    return no_trailing_whitespace
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def image_to_string(args):
    """
    OCR a single image and return the recognised text.

    ``args`` is an ``(img, lang)`` pair, packed into one argument so the
    function can be used with ``Pool.map``.  ``img`` is joined onto
    ``RasterisedDocumentParser.SCRATCH``.  The first tool reported by pyocr
    does the work.  If that tool can detect orientation, the image is
    rotated by the detected angle first; a Tesseract error during detection
    is ignored and the image is used unrotated.
    """
    img, lang = args
    tool = pyocr.get_available_tools()[0]
    path = os.path.join(RasterisedDocumentParser.SCRATCH, img)

    with Image.open(path) as image:
        if tool.can_detect_orientation():
            try:
                detected = tool.detect_orientation(image, lang=lang)
                image = image.rotate(detected["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                # Best effort only: carry on with the unrotated image.
                pass
        return tool.image_to_string(image, lang=lang)
 | 
			
		||||
							
								
								
									
										23
									
								
								src/paperless_tesseract/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless_tesseract/signals.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,23 @@
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
from .parsers import RasterisedDocumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ConsumerDeclaration(object):
    """
    Declares RasterisedDocumentParser as a parser for PDF and image files.

    ``handle`` is what gets connected to the consumer declaration signal; it
    hands back ``test``, which is then called with a document's file name.
    """

    # File names this parser accepts.  Matching ignores case so that names
    # such as "SCAN.PDF" are recognised as well.
    MATCHING_FILES = re.compile(
        r"^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$", re.IGNORECASE)

    @classmethod
    def handle(cls, sender, **kwargs):
        """
        Signal receiver: return the callable that tests a document.

        ``sender`` and ``kwargs`` are accepted but not used.
        """
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return a dict with the parser class and its weight if the file name
        ``doc`` has a supported extension, otherwise None.
        """

        if cls.MATCHING_FILES.match(doc):
            return {
                "parser": RasterisedDocumentParser,
                "weight": 0
            }

        return None
 | 
			
		||||
							
								
								
									
										0
									
								
								src/paperless_tesseract/tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_tesseract/tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							| 
		 Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB  | 
							
								
								
									
										80
									
								
								src/paperless_tesseract/tests/test_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								src/paperless_tesseract/tests/test_ocr.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,80 @@
 | 
			
		||||
import os
 | 
			
		||||
from unittest import mock, skipIf
 | 
			
		||||
 | 
			
		||||
import pyocr
 | 
			
		||||
from django.test import TestCase
 | 
			
		||||
from pyocr.libtesseract.tesseract_raw import \
 | 
			
		||||
    TesseractError as OtherTesseractError
 | 
			
		||||
 | 
			
		||||
from ..parsers import image_to_string, strip_excess_whitespace
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FakeTesseract(object):
    """
    Stand-in for a pyocr tool whose orientation detection always fails.
    """

    @staticmethod
    def can_detect_orientation():
        """Claim orientation detection is supported."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always fail, with the libtesseract flavour of TesseractError."""
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        """Return a fixed string regardless of the arguments."""
        return "This is test text"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FakePyOcr(object):
    """
    Stand-in for the ``pyocr`` module that only offers FakeTesseract.
    """

    @staticmethod
    def get_available_tools():
        """Return a one-element list holding the FakeTesseract class."""
        tools = [FakeTesseract]
        return tools
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestOCR(TestCase):
    """
    Tests for the module-level helpers in ``paperless_tesseract.parsers``.
    """

    # (input, expected output) pairs for strip_excess_whitespace().
    text_cases = [
        ("simple     string", "simple string"),
        (
            "simple    newline\n   testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8   строка с пробелами в конце  ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    # Directory the image test patches in as the parser's SCRATCH directory.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    # True when pyocr reports at least one usable OCR tool.
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        """
        Every entry in ``text_cases`` must be normalised as expected.
        """
        for source, result in self.text_cases:
            # One sub-test per case, so a failure does not hide later cases.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace({}) != '{}', but '{}'".format(
                        source,
                        result,
                        actual_result
                    )
                )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        image_to_string(["no-text.png", "en"])
 | 
			
		||||
@ -5,7 +5,7 @@
 | 
			
		||||
 | 
			
		||||
[tox]
 | 
			
		||||
skipsdist = True
 | 
			
		||||
envlist = py34, py35, pep8
 | 
			
		||||
envlist = py34, py35, py36, pep8
 | 
			
		||||
 | 
			
		||||
[testenv]
 | 
			
		||||
commands = {envpython} manage.py test
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user