diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 09b1442c0..1297162e2 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -225,11 +225,11 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) - return default_thumbnail_path -def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: +def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path: """ The thumbnail of a PDF is just a 500px wide image of the first page. """ - out_path = os.path.join(temp_dir, "convert.webp") + out_path = temp_dir / "convert.webp" # Run convert to get a decent thumbnail try: @@ -242,7 +242,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: auto_orient=True, use_cropbox=True, input_file=f"{in_path}[0]", - output_file=out_path, + output_file=str(out_path), logging_group=logging_group, ) except ParseError as e: diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 9047b5f90..4e83844e2 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -52,7 +52,12 @@ class MailDocumentParser(DocumentParser): return PdfAFormat.A3b return None - def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None): + def get_thumbnail( + self, + document_path: Path, + mime_type: str, + file_name=None, + ) -> Path: if not self.archive_path: self.archive_path = self.generate_pdf( self.parse_file_to_message(document_path), diff --git a/src/paperless_mail/tests/conftest.py b/src/paperless_mail/tests/conftest.py index 07b2a6dac..dcfa904a1 100644 --- a/src/paperless_mail/tests/conftest.py +++ b/src/paperless_mail/tests/conftest.py @@ -25,6 +25,11 @@ def simple_txt_email_pdf_file(sample_dir: Path) -> Path: return sample_dir / "simple_text.eml.pdf" +@pytest.fixture(scope="session") +def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path: + return sample_dir / "simple_text.eml.pdf.webp" + + @pytest.fixture(scope="session") def html_email_file(sample_dir: Path) -> Path: return sample_dir / "html.eml" @@ -35,11 +40,26 @@ def html_email_pdf_file(sample_dir: Path) -> Path: return sample_dir / "html.eml.pdf" +@pytest.fixture(scope="session") +def html_email_thumbnail_file(sample_dir: Path) -> Path: + return sample_dir / "html.eml.pdf.webp" + + @pytest.fixture(scope="session") def html_email_html_file(sample_dir: Path) -> Path: return sample_dir / "html.eml.html" +@pytest.fixture(scope="session") +def merged_pdf_first(sample_dir: Path) -> Path: + return sample_dir / "first.pdf" + + +@pytest.fixture(scope="session") +def merged_pdf_second(sample_dir: Path) -> Path: + return sample_dir / "second.pdf" + + @pytest.fixture() def mail_parser() -> MailDocumentParser: return MailDocumentParser(logging_group=None) diff --git a/src/paperless_mail/tests/test_parsers.py b/src/paperless_mail/tests/test_parsers.py index 1b81930e4..ddb10082b 100644 --- a/src/paperless_mail/tests/test_parsers.py +++ b/src/paperless_mail/tests/test_parsers.py @@ -5,6 +5,7 @@ from pathlib import Path import httpx import pytest from django.test.html import parse_html +from pytest_django.fixtures import SettingsWrapper from pytest_httpx import HTTPXMock from pytest_mock import MockerFixture @@ -328,7 +329,11 @@ class TestTikaHtmlParse: with pytest.raises(ParseError): mail_parser.tika_parse(html) - def test_tika_parse_unreachable(self, mail_parser: MailDocumentParser): + def test_tika_parse_unreachable( + self, + settings: SettingsWrapper, + mail_parser: MailDocumentParser, + ): """ GIVEN: - Fresh start @@ -341,7 +346,7 @@ class TestTikaHtmlParse: # Check if exception is raised when Tika cannot be reached. with pytest.raises(ParseError): - mail_parser.tika_server = "" + settings.TIKA_ENDPOINT = "http://does-not-exist:9998" mail_parser.tika_parse(html) diff --git a/src/paperless_mail/tests/test_parsers_live.py b/src/paperless_mail/tests/test_parsers_live.py index 3260725a5..9e13ad25e 100644 --- a/src/paperless_mail/tests/test_parsers_live.py +++ b/src/paperless_mail/tests/test_parsers_live.py @@ -3,17 +3,15 @@ import shutil import subprocess import tempfile from pathlib import Path -from unittest import mock import httpx import pytest -from django.test import TestCase from imagehash import average_hash from PIL import Image +from pytest_mock import MockerFixture -from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import util_call_with_backoff -from paperless_mail.tests.test_parsers import BaseMailParserTestCase +from paperless_mail.parsers import MailDocumentParser def extract_text(pdf_path: Path) -> str: @@ -50,7 +48,7 @@ class MailAttachmentMock: "PAPERLESS_CI_TEST" not in os.environ, reason="No Gotenberg/Tika servers to test with", ) -class TestUrlCanary(TestCase): +class TestUrlCanary: """ Verify certain URLs are still available so testing is valid still """ @@ -69,13 +67,13 @@ class TestUrlCanary(TestCase): whether this image stays online forever, so here we check if we can detect if is not available anymore. """ - with self.assertRaises(httpx.HTTPStatusError) as cm: + with pytest.raises(httpx.HTTPStatusError) as exec_info: resp = httpx.get( "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png", ) resp.raise_for_status() - self.assertEqual(cm.exception.response.status_code, httpx.codes.NOT_FOUND) + assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND def test_is_online_image_still_available(self): """ @@ -100,13 +98,19 @@ class TestUrlCanary(TestCase): "PAPERLESS_CI_TEST" not in os.environ, reason="No Gotenberg/Tika servers to test with", ) -class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): +class TestParserLive: @staticmethod def imagehash(file, hash_size=18): return f"{average_hash(Image.open(file), hash_size)}" - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") - def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock): + def test_get_thumbnail( + self, + mocker: MockerFixture, + mail_parser: MailDocumentParser, + simple_txt_email_file: Path, + simple_txt_email_pdf_file: Path, + simple_txt_email_thumbnail_file: Path, + ): """ GIVEN: - Fresh start @@ -115,22 +119,21 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): THEN: - The returned thumbnail image file is as expected """ - mock_generate_pdf.return_value = self.SAMPLE_DIR / "simple_text.eml.pdf" - thumb = self.parser.get_thumbnail( - self.SAMPLE_DIR / "simple_text.eml", - "message/rfc822", + mock_generate_pdf = mocker.patch( + "paperless_mail.parsers.MailDocumentParser.generate_pdf", ) - self.assertIsFile(thumb) + mock_generate_pdf.return_value = simple_txt_email_pdf_file - expected = self.SAMPLE_DIR / "simple_text.eml.pdf.webp" + thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822") - self.assertEqual( - self.imagehash(thumb), - self.imagehash(expected), - f"Created Thumbnail {thumb} differs from expected file {expected}", - ) + assert thumb.exists() + assert thumb.is_file() - def test_tika_parse_successful(self): + assert ( + self.imagehash(thumb) == self.imagehash(simple_txt_email_thumbnail_file) + ), f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}" + + def test_tika_parse_successful(self, mail_parser: MailDocumentParser): """ GIVEN: - Fresh start @@ -143,15 +146,16 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): expected_text = "Some Text" # Check successful parsing - parsed = self.parser.tika_parse(html) - self.assertEqual(expected_text, parsed.strip()) + parsed = mail_parser.tika_parse(html) + assert expected_text == parsed.strip() - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail") - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html") def test_generate_pdf_gotenberg_merging( self, - mock_generate_pdf_from_html: mock.MagicMock, - mock_generate_pdf_from_mail: mock.MagicMock, + mocker: MockerFixture, + mail_parser: MailDocumentParser, + html_email_file: Path, + merged_pdf_first: Path, + merged_pdf_second: Path, ): """ GIVEN: @@ -161,61 +165,67 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): THEN: - gotenberg is called to merge files and the resulting file is returned """ - mock_generate_pdf_from_mail.return_value = self.SAMPLE_DIR / "first.pdf" - mock_generate_pdf_from_html.return_value = self.SAMPLE_DIR / "second.pdf" - - msg = self.parser.parse_file_to_message( - self.SAMPLE_DIR / "html.eml", + mock_generate_pdf_from_html = mocker.patch( + "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html", ) + mock_generate_pdf_from_mail = mocker.patch( + "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail", + ) + mock_generate_pdf_from_mail.return_value = merged_pdf_first + mock_generate_pdf_from_html.return_value = merged_pdf_second + + msg = mail_parser.parse_file_to_message(html_email_file) _, pdf_path = util_call_with_backoff( - self.parser.generate_pdf, + mail_parser.generate_pdf, [msg], ) - self.assertIsFile(pdf_path) + assert pdf_path.exists() + assert pdf_path.is_file() extracted = extract_text(pdf_path) expected = ( "first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c" ) - self.assertEqual(expected, extracted) + assert expected == extracted - def test_generate_pdf_from_mail(self): + def test_generate_pdf_from_mail( + self, + mail_parser: MailDocumentParser, + html_email_file: Path, + html_email_pdf_file: Path, + html_email_thumbnail_file: Path, + ): """ GIVEN: - Fresh start WHEN: - pdf generation from simple eml file is requested THEN: - - gotenberg is called and the resulting file is returned and look as expected. + - Gotenberg is called and the resulting file is returned and look as expected. """ - util_call_with_backoff( - self.parser.parse, - [self.SAMPLE_DIR / "html.eml", "message/rfc822"], - ) + util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"]) # Check the archive PDF - archive_path = self.parser.get_archive_path() + archive_path = mail_parser.get_archive_path() archive_text = extract_text(archive_path) - expected_archive_text = extract_text(self.SAMPLE_DIR / "html.eml.pdf") + expected_archive_text = extract_text(html_email_pdf_file) # Archive includes the HTML content, so use in - self.assertIn(expected_archive_text, archive_text) + assert expected_archive_text in archive_text # Check the thumbnail - generated_thumbnail = self.parser.get_thumbnail( - self.SAMPLE_DIR / "html.eml", + generated_thumbnail = mail_parser.get_thumbnail( + html_email_file, "message/rfc822", ) generated_thumbnail_hash = self.imagehash(generated_thumbnail) # The created pdf is not reproducible. But the converted image should always look the same. - expected_hash = self.imagehash(self.SAMPLE_DIR / "html.eml.pdf.webp") + expected_hash = self.imagehash(html_email_thumbnail_file) - self.assertEqual( - generated_thumbnail_hash, - expected_hash, - f"PDF looks different. Check if {generated_thumbnail} looks weird.", - ) + assert ( + generated_thumbnail_hash == expected_hash + ), f"PDF looks different. Check if {generated_thumbnail} looks weird."