diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index 30687680c..e7b263ed8 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -418,6 +418,15 @@ Insurances/ # Insurances Defining a storage path is optional. If no storage path is defined for a document, the global [`PAPERLESS_FILENAME_FORMAT`](configuration.md#PAPERLESS_FILENAME_FORMAT) is applied. +## Automatic recovery of invalid PDFs {#pdf-recovery} + +Paperless will attempt to "clean" certain invalid PDFs with `qpdf` before processing if, for example, the mime_type +detection is incorrect. This can happen if the PDF is not properly formatted or contains errors. + +!!! warning + + This process will technically modify the document before processing. + ## Celery Monitoring {#celery-monitoring} The monitoring tool diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 57277e4a6..0bc335b8b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -539,6 +539,29 @@ class ConsumerPlugin( self.log.debug(f"Detected mime type: {mime_type}") + if ( + Path(self.filename).suffix.lower() == ".pdf" + and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES + ): + try: + # The file might be a pdf, but the mime type is wrong. 
+ # Try to clean with qpdf + self.log.debug( + "Detected possible PDF with wrong mime type, trying to clean with qpdf", + ) + run_subprocess( + [ + "qpdf", + "--replace-input", + self.working_copy, + ], + logger=self.log, + ) + mime_type = magic.from_file(self.working_copy, mime=True) + self.log.debug(f"Detected mime type after qpdf: {mime_type}") + except Exception as e: + self.log.error(f"Error attempting to clean PDF: {e}") + # Based on the mime type, get the parser for that type parser_class: Optional[type[DocumentParser]] = ( get_parser_class_for_mime_type( diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index 737d1256f..30f3dd26d 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -1389,9 +1389,18 @@ class PostDocumentSerializer(serializers.Serializer): mime_type = magic.from_buffer(document_data, mime=True) if not is_mime_type_supported(mime_type): - raise serializers.ValidationError( - _("File type %(type)s not supported") % {"type": mime_type}, - ) + if ( + mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES + and document.name.endswith( + ".pdf", + ) + ): + # If the file is an invalid PDF, we can try to recover it later in the consumer + mime_type = "application/pdf" + else: + raise serializers.ValidationError( + _("File type %(type)s not supported") % {"type": mime_type}, + ) return document.name, document_data diff --git a/src/documents/tests/samples/invalid_pdf.pdf b/src/documents/tests/samples/invalid_pdf.pdf new file mode 100644 index 000000000..f226c2d84 Binary files /dev/null and b/src/documents/tests/samples/invalid_pdf.pdf differ diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py index ee2e8ee1e..b1cd43932 100644 --- a/src/documents/tests/test_api_documents.py +++ b/src/documents/tests/test_api_documents.py @@ -1402,6 +1402,27 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): 
self.assertEqual(overrides.filename, "simple.pdf") self.assertEqual(overrides.custom_field_ids, [custom_field.id]) + def test_upload_invalid_pdf(self): + """ + GIVEN: Invalid PDF named "*.pdf" whose mime_type is in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES + WHEN: Upload the file + THEN: The file is not rejected + """ + self.consume_file_mock.return_value = celery.result.AsyncResult( + id=str(uuid.uuid4()), + ) + + with open( + os.path.join(os.path.dirname(__file__), "samples", "invalid_pdf.pdf"), + "rb", + ) as f: + response = self.client.post( + "/api/documents/post_document/", + {"document": f}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + def test_get_metadata(self): doc = Document.objects.create( title="test", diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 5b56e2cca..aa452e15b 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -235,6 +235,8 @@ class FaultyGenericExceptionParser(_BaseTestParser): def fake_magic_from_file(file, mime=False): if mime: + if file.name.startswith("invalid_pdf"): + return "application/octet-stream" if os.path.splitext(file)[1] == ".pdf": return "application/pdf" elif os.path.splitext(file)[1] == ".png": @@ -952,6 +954,27 @@ class TestConsumer( sanity_check() + @mock.patch("documents.consumer.run_subprocess") + def test_try_to_clean_invalid_pdf(self, m): + shutil.copy( + Path(__file__).parent / "samples" / "invalid_pdf.pdf", + settings.CONSUMPTION_DIR / "invalid_pdf.pdf", + ) + with self.get_consumer( + settings.CONSUMPTION_DIR / "invalid_pdf.pdf", + ) as consumer: + # fails because qpdf is mocked out and never actually repairs the file + self.assertRaises(ConsumerError, consumer.run) + + m.assert_called_once() + + args, _ = m.call_args + + command = args[0] + + self.assertEqual(command[0], "qpdf") + self.assertEqual(command[1], "--replace-input") + @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) class
TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase): diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 851fe6217..2da0b49f1 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -960,6 +960,8 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean( "PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT", ) +CONSUMER_PDF_RECOVERABLE_MIME_TYPES = ("application/octet-stream",) + OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES") # The default language that tesseract will attempt to use when parsing