From 87f6f12fae743fa7f81ea933ac47960b703ba19c Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Thu, 31 Aug 2023 14:19:51 -0700 Subject: [PATCH] Differentiate generic vs parse errors during consumption --- src/documents/consumer.py | 14 ++++-- src/documents/tests/test_consumer.py | 45 +++++++++++++++++++- src/paperless_tesseract/tests/test_parser.py | 13 ++++++ 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 863376505..59c4b7d85 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -34,6 +34,7 @@ from .models import DocumentType from .models import FileInfo from .models import Tag from .parsers import DocumentParser +from .parsers import ParseError from .parsers import get_parser_class_for_mime_type from .parsers import parse_date from .signals import document_consumption_finished @@ -448,12 +449,19 @@ class Consumer(LoggingMixin): date = parse_date(self.filename, text) archive_path = document_parser.get_archive_path() + except ParseError as e: + self._fail( + str(e), + f"Error occurred while consuming document {self.filename}: {e}", + exc_info=True, + exception=e, + ) except Exception as e: document_parser.cleanup() tempdir.cleanup() self._fail( str(e), - f"Error while consuming document {self.filename}: {e}", + f"Unexpected error while consuming document {self.filename}: {e}", exc_info=True, exception=e, ) @@ -543,8 +551,8 @@ class Consumer(LoggingMixin): except Exception as e: self._fail( str(e), - f"The following error occurred while consuming " - f"{self.filename}: {e}", + f"The following error occurred while storing document " + f"{self.filename} after consuming: {e}", exc_info=True, exception=e, ) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index a8f427c37..a9cb887de 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -211,6 +211,18 @@ class FaultyParser(DocumentParser): raise ParseError("Does not compute.") +class FaultyGenericExceptionParser(DocumentParser): + def __init__(self, logging_group, scratch_dir): + super().__init__(logging_group) + _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) + + def get_thumbnail(self, document_path, mime_type, file_name=None): + return self.fake_thumb + + def parse(self, document_path, mime_type, file_name=None): + raise Exception("Generic exception.") + + def fake_magic_from_file(file, mime=False): if mime: if os.path.splitext(file)[1] == ".pdf": @@ -260,6 +272,13 @@ class TestConsumer(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def make_faulty_parser(self, logging_group, progress_callback=None): return FaultyParser(logging_group, self.dirs.scratch_dir) + def make_faulty_generic_exception_parser( + self, + logging_group, + progress_callback=None, + ): + return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir) + def setUp(self): super().setUp() @@ -496,7 +515,29 @@ class TestConsumer(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertRaisesMessage( ConsumerError, - "sample.pdf: Error while consuming document sample.pdf: Does not compute.", + "sample.pdf: Error occurred while consuming document sample.pdf: Does not compute.", + self.consumer.try_consume_file, + self.get_test_file(), + ) + + self._assert_first_last_send_progress(last_status="FAILED") + + @mock.patch("documents.parsers.document_consumer_declaration.send") + def testGenericParserException(self, m): + m.return_value = [ + ( + None, + { + "parser": self.make_faulty_generic_exception_parser, + "mime_types": {"application/pdf": ".pdf"}, + "weight": 0, + }, + ), + ] + + self.assertRaisesMessage( + ConsumerError, + "sample.pdf: Unexpected error while consuming document sample.pdf: Generic exception.", self.consumer.try_consume_file, self.get_test_file(), ) @@ -510,7 +551,7 @@ class TestConsumer(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertRaisesMessage( ConsumerError, - "sample.pdf: The following error occurred while consuming sample.pdf: NO.", + "sample.pdf: The following error occurred while storing document sample.pdf after consuming: NO.", self.consumer.try_consume_file, filename, ) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 8b3de5615..606453904 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -8,6 +8,7 @@ from unittest import mock from django.test import TestCase from django.test import override_settings +from ocrmypdf import SubprocessOutputError from documents.parsers import ParseError from documents.parsers import run_convert @@ -827,6 +828,18 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): # Copied from the PDF to here. Don't even look at it self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text()) + @mock.patch("ocrmypdf.ocr") + def test_gs_rendering_error(self, m): + m.side_effect = SubprocessOutputError("Ghostscript PDF/A rendering failed") + parser = RasterisedDocumentParser(None) + + self.assertRaises( + ParseError, + parser.parse, + os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), + "application/pdf", + ) + class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase): SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")