More changes to fixtures, etc

This commit is contained in:
Trenton H 2024-06-26 10:53:50 -07:00
parent c59e444d23
commit 4d53dff978
5 changed files with 99 additions and 59 deletions

View File

@ -225,11 +225,11 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
return default_thumbnail_path return default_thumbnail_path
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path:
""" """
The thumbnail of a PDF is just a 500px wide image of the first page. The thumbnail of a PDF is just a 500px wide image of the first page.
""" """
out_path = os.path.join(temp_dir, "convert.webp") out_path = temp_dir / "convert.webp"
# Run convert to get a decent thumbnail # Run convert to get a decent thumbnail
try: try:
@ -242,7 +242,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
auto_orient=True, auto_orient=True,
use_cropbox=True, use_cropbox=True,
input_file=f"{in_path}[0]", input_file=f"{in_path}[0]",
output_file=out_path, output_file=str(out_path),
logging_group=logging_group, logging_group=logging_group,
) )
except ParseError as e: except ParseError as e:

View File

@ -52,7 +52,12 @@ class MailDocumentParser(DocumentParser):
return PdfAFormat.A3b return PdfAFormat.A3b
return None return None
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None): def get_thumbnail(
self,
document_path: Path,
mime_type: str,
file_name=None,
) -> Path:
if not self.archive_path: if not self.archive_path:
self.archive_path = self.generate_pdf( self.archive_path = self.generate_pdf(
self.parse_file_to_message(document_path), self.parse_file_to_message(document_path),

View File

@ -25,6 +25,11 @@ def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
return sample_dir / "simple_text.eml.pdf" return sample_dir / "simple_text.eml.pdf"
@pytest.fixture(scope="session")
def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
return sample_dir / "simple_text.eml.pdf.webp"
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def html_email_file(sample_dir: Path) -> Path: def html_email_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml" return sample_dir / "html.eml"
@ -35,11 +40,26 @@ def html_email_pdf_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml.pdf" return sample_dir / "html.eml.pdf"
@pytest.fixture(scope="session")
def html_email_thumbnail_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml.pdf.webp"
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def html_email_html_file(sample_dir: Path) -> Path: def html_email_html_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml.html" return sample_dir / "html.eml.html"
@pytest.fixture(scope="session")
def merged_pdf_first(sample_dir: Path) -> Path:
return sample_dir / "first.pdf"
@pytest.fixture(scope="session")
def merged_pdf_second(sample_dir: Path) -> Path:
return sample_dir / "second.pdf"
@pytest.fixture() @pytest.fixture()
def mail_parser() -> MailDocumentParser: def mail_parser() -> MailDocumentParser:
return MailDocumentParser(logging_group=None) return MailDocumentParser(logging_group=None)

View File

@ -5,6 +5,7 @@ from pathlib import Path
import httpx import httpx
import pytest import pytest
from django.test.html import parse_html from django.test.html import parse_html
from pytest_django.fixtures import SettingsWrapper
from pytest_httpx import HTTPXMock from pytest_httpx import HTTPXMock
from pytest_mock import MockerFixture from pytest_mock import MockerFixture
@ -328,7 +329,11 @@ class TestTikaHtmlParse:
with pytest.raises(ParseError): with pytest.raises(ParseError):
mail_parser.tika_parse(html) mail_parser.tika_parse(html)
def test_tika_parse_unreachable(self, mail_parser: MailDocumentParser): def test_tika_parse_unreachable(
self,
settings: SettingsWrapper,
mail_parser: MailDocumentParser,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -341,7 +346,7 @@ class TestTikaHtmlParse:
# Check if exception is raised when Tika cannot be reached. # Check if exception is raised when Tika cannot be reached.
with pytest.raises(ParseError): with pytest.raises(ParseError):
mail_parser.tika_server = "" settings.TIKA_ENDPOINT = "http://does-not-exist:9998"
mail_parser.tika_parse(html) mail_parser.tika_parse(html)

View File

@ -3,17 +3,15 @@ import shutil
import subprocess import subprocess
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from unittest import mock
import httpx import httpx
import pytest import pytest
from django.test import TestCase
from imagehash import average_hash from imagehash import average_hash
from PIL import Image from PIL import Image
from pytest_mock import MockerFixture
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import util_call_with_backoff from documents.tests.utils import util_call_with_backoff
from paperless_mail.tests.test_parsers import BaseMailParserTestCase from paperless_mail.parsers import MailDocumentParser
def extract_text(pdf_path: Path) -> str: def extract_text(pdf_path: Path) -> str:
@ -50,7 +48,7 @@ class MailAttachmentMock:
"PAPERLESS_CI_TEST" not in os.environ, "PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with", reason="No Gotenberg/Tika servers to test with",
) )
class TestUrlCanary(TestCase): class TestUrlCanary:
""" """
Verify certain URLs are still available so testing is valid still Verify certain URLs are still available so testing is valid still
""" """
@ -69,13 +67,13 @@ class TestUrlCanary(TestCase):
whether this image stays online forever, so here we check if we can detect if is not whether this image stays online forever, so here we check if we can detect if is not
available anymore. available anymore.
""" """
with self.assertRaises(httpx.HTTPStatusError) as cm: with pytest.raises(httpx.HTTPStatusError) as exec_info:
resp = httpx.get( resp = httpx.get(
"https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png", "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
) )
resp.raise_for_status() resp.raise_for_status()
self.assertEqual(cm.exception.response.status_code, httpx.codes.NOT_FOUND) assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND
def test_is_online_image_still_available(self): def test_is_online_image_still_available(self):
""" """
@ -100,13 +98,19 @@ class TestUrlCanary(TestCase):
"PAPERLESS_CI_TEST" not in os.environ, "PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with", reason="No Gotenberg/Tika servers to test with",
) )
class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): class TestParserLive:
@staticmethod @staticmethod
def imagehash(file, hash_size=18): def imagehash(file, hash_size=18):
return f"{average_hash(Image.open(file), hash_size)}" return f"{average_hash(Image.open(file), hash_size)}"
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") def test_get_thumbnail(
def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock): self,
mocker: MockerFixture,
mail_parser: MailDocumentParser,
simple_txt_email_file: Path,
simple_txt_email_pdf_file: Path,
simple_txt_email_thumbnail_file: Path,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -115,22 +119,21 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
THEN: THEN:
- The returned thumbnail image file is as expected - The returned thumbnail image file is as expected
""" """
mock_generate_pdf.return_value = self.SAMPLE_DIR / "simple_text.eml.pdf" mock_generate_pdf = mocker.patch(
thumb = self.parser.get_thumbnail( "paperless_mail.parsers.MailDocumentParser.generate_pdf",
self.SAMPLE_DIR / "simple_text.eml",
"message/rfc822",
) )
self.assertIsFile(thumb) mock_generate_pdf.return_value = simple_txt_email_pdf_file
expected = self.SAMPLE_DIR / "simple_text.eml.pdf.webp" thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822")
self.assertEqual( assert thumb.exists()
self.imagehash(thumb), assert thumb.is_file()
self.imagehash(expected),
f"Created Thumbnail {thumb} differs from expected file {expected}",
)
def test_tika_parse_successful(self): assert (
self.imagehash(thumb) == self.imagehash(simple_txt_email_thumbnail_file)
), f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}"
def test_tika_parse_successful(self, mail_parser: MailDocumentParser):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -143,15 +146,16 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
expected_text = "Some Text" expected_text = "Some Text"
# Check successful parsing # Check successful parsing
parsed = self.parser.tika_parse(html) parsed = mail_parser.tika_parse(html)
self.assertEqual(expected_text, parsed.strip()) assert expected_text == parsed.strip()
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf_gotenberg_merging( def test_generate_pdf_gotenberg_merging(
self, self,
mock_generate_pdf_from_html: mock.MagicMock, mocker: MockerFixture,
mock_generate_pdf_from_mail: mock.MagicMock, mail_parser: MailDocumentParser,
html_email_file: Path,
merged_pdf_first: Path,
merged_pdf_second: Path,
): ):
""" """
GIVEN: GIVEN:
@ -161,61 +165,67 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
THEN: THEN:
- gotenberg is called to merge files and the resulting file is returned - gotenberg is called to merge files and the resulting file is returned
""" """
mock_generate_pdf_from_mail.return_value = self.SAMPLE_DIR / "first.pdf" mock_generate_pdf_from_html = mocker.patch(
mock_generate_pdf_from_html.return_value = self.SAMPLE_DIR / "second.pdf" "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
msg = self.parser.parse_file_to_message(
self.SAMPLE_DIR / "html.eml",
) )
mock_generate_pdf_from_mail = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
)
mock_generate_pdf_from_mail.return_value = merged_pdf_first
mock_generate_pdf_from_html.return_value = merged_pdf_second
msg = mail_parser.parse_file_to_message(html_email_file)
_, pdf_path = util_call_with_backoff( _, pdf_path = util_call_with_backoff(
self.parser.generate_pdf, mail_parser.generate_pdf,
[msg], [msg],
) )
self.assertIsFile(pdf_path) assert pdf_path.exists()
assert pdf_path.is_file()
extracted = extract_text(pdf_path) extracted = extract_text(pdf_path)
expected = ( expected = (
"first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c" "first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c"
) )
self.assertEqual(expected, extracted) assert expected == extracted
def test_generate_pdf_from_mail(self): def test_generate_pdf_from_mail(
self,
mail_parser: MailDocumentParser,
html_email_file: Path,
html_email_pdf_file: Path,
html_email_thumbnail_file: Path,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
WHEN: WHEN:
- pdf generation from simple eml file is requested - pdf generation from simple eml file is requested
THEN: THEN:
- gotenberg is called and the resulting file is returned and look as expected. - Gotenberg is called and the resulting file is returned and look as expected.
""" """
util_call_with_backoff( util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"])
self.parser.parse,
[self.SAMPLE_DIR / "html.eml", "message/rfc822"],
)
# Check the archive PDF # Check the archive PDF
archive_path = self.parser.get_archive_path() archive_path = mail_parser.get_archive_path()
archive_text = extract_text(archive_path) archive_text = extract_text(archive_path)
expected_archive_text = extract_text(self.SAMPLE_DIR / "html.eml.pdf") expected_archive_text = extract_text(html_email_pdf_file)
# Archive includes the HTML content, so use in # Archive includes the HTML content, so use in
self.assertIn(expected_archive_text, archive_text) assert expected_archive_text in archive_text
# Check the thumbnail # Check the thumbnail
generated_thumbnail = self.parser.get_thumbnail( generated_thumbnail = mail_parser.get_thumbnail(
self.SAMPLE_DIR / "html.eml", html_email_file,
"message/rfc822", "message/rfc822",
) )
generated_thumbnail_hash = self.imagehash(generated_thumbnail) generated_thumbnail_hash = self.imagehash(generated_thumbnail)
# The created pdf is not reproducible. But the converted image should always look the same. # The created pdf is not reproducible. But the converted image should always look the same.
expected_hash = self.imagehash(self.SAMPLE_DIR / "html.eml.pdf.webp") expected_hash = self.imagehash(html_email_thumbnail_file)
self.assertEqual( assert (
generated_thumbnail_hash, generated_thumbnail_hash == expected_hash
expected_hash, ), f"PDF looks different. Check if {generated_thumbnail} looks weird."
f"PDF looks different. Check if {generated_thumbnail} looks weird.",
)