More changes to fixutres, etc

This commit is contained in:
Trenton H 2024-06-26 10:53:50 -07:00
parent c59e444d23
commit 4d53dff978
5 changed files with 99 additions and 59 deletions

View File

@ -225,11 +225,11 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
return default_thumbnail_path
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path:
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
out_path = os.path.join(temp_dir, "convert.webp")
out_path = temp_dir / "convert.webp"
# Run convert to get a decent thumbnail
try:
@ -242,7 +242,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
auto_orient=True,
use_cropbox=True,
input_file=f"{in_path}[0]",
output_file=out_path,
output_file=str(out_path),
logging_group=logging_group,
)
except ParseError as e:

View File

@ -52,7 +52,12 @@ class MailDocumentParser(DocumentParser):
return PdfAFormat.A3b
return None
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
def get_thumbnail(
self,
document_path: Path,
mime_type: str,
file_name=None,
) -> Path:
if not self.archive_path:
self.archive_path = self.generate_pdf(
self.parse_file_to_message(document_path),

View File

@ -25,6 +25,11 @@ def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
return sample_dir / "simple_text.eml.pdf"
@pytest.fixture(scope="session")
def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
return sample_dir / "simple_text.eml.pdf.webp"
@pytest.fixture(scope="session")
def html_email_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml"
@ -35,11 +40,26 @@ def html_email_pdf_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml.pdf"
@pytest.fixture(scope="session")
def html_email_thumbnail_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml.pdf.webp"
@pytest.fixture(scope="session")
def html_email_html_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml.html"
@pytest.fixture(scope="session")
def merged_pdf_first(sample_dir: Path) -> Path:
return sample_dir / "first.pdf"
@pytest.fixture(scope="session")
def merged_pdf_second(sample_dir: Path) -> Path:
return sample_dir / "second.pdf"
@pytest.fixture()
def mail_parser() -> MailDocumentParser:
return MailDocumentParser(logging_group=None)

View File

@ -5,6 +5,7 @@ from pathlib import Path
import httpx
import pytest
from django.test.html import parse_html
from pytest_django.fixtures import SettingsWrapper
from pytest_httpx import HTTPXMock
from pytest_mock import MockerFixture
@ -328,7 +329,11 @@ class TestTikaHtmlParse:
with pytest.raises(ParseError):
mail_parser.tika_parse(html)
def test_tika_parse_unreachable(self, mail_parser: MailDocumentParser):
def test_tika_parse_unreachable(
self,
settings: SettingsWrapper,
mail_parser: MailDocumentParser,
):
"""
GIVEN:
- Fresh start
@ -341,7 +346,7 @@ class TestTikaHtmlParse:
# Check if exception is raised when Tika cannot be reached.
with pytest.raises(ParseError):
mail_parser.tika_server = ""
settings.TIKA_ENDPOINT = "http://does-not-exist:9998"
mail_parser.tika_parse(html)

View File

@ -3,17 +3,15 @@ import shutil
import subprocess
import tempfile
from pathlib import Path
from unittest import mock
import httpx
import pytest
from django.test import TestCase
from imagehash import average_hash
from PIL import Image
from pytest_mock import MockerFixture
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import util_call_with_backoff
from paperless_mail.tests.test_parsers import BaseMailParserTestCase
from paperless_mail.parsers import MailDocumentParser
def extract_text(pdf_path: Path) -> str:
@ -50,7 +48,7 @@ class MailAttachmentMock:
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
class TestUrlCanary(TestCase):
class TestUrlCanary:
"""
Verify certain URLs are still available so testing is valid still
"""
@ -69,13 +67,13 @@ class TestUrlCanary(TestCase):
whether this image stays online forever, so here we check if we can detect if is not
available anymore.
"""
with self.assertRaises(httpx.HTTPStatusError) as cm:
with pytest.raises(httpx.HTTPStatusError) as exec_info:
resp = httpx.get(
"https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
)
resp.raise_for_status()
self.assertEqual(cm.exception.response.status_code, httpx.codes.NOT_FOUND)
assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND
def test_is_online_image_still_available(self):
"""
@ -100,13 +98,19 @@ class TestUrlCanary(TestCase):
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
class TestParserLive:
@staticmethod
def imagehash(file, hash_size=18):
return f"{average_hash(Image.open(file), hash_size)}"
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
def test_get_thumbnail(
self,
mocker: MockerFixture,
mail_parser: MailDocumentParser,
simple_txt_email_file: Path,
simple_txt_email_pdf_file: Path,
simple_txt_email_thumbnail_file: Path,
):
"""
GIVEN:
- Fresh start
@ -115,22 +119,21 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
THEN:
- The returned thumbnail image file is as expected
"""
mock_generate_pdf.return_value = self.SAMPLE_DIR / "simple_text.eml.pdf"
thumb = self.parser.get_thumbnail(
self.SAMPLE_DIR / "simple_text.eml",
"message/rfc822",
mock_generate_pdf = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
)
self.assertIsFile(thumb)
mock_generate_pdf.return_value = simple_txt_email_pdf_file
expected = self.SAMPLE_DIR / "simple_text.eml.pdf.webp"
thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822")
self.assertEqual(
self.imagehash(thumb),
self.imagehash(expected),
f"Created Thumbnail {thumb} differs from expected file {expected}",
)
assert thumb.exists()
assert thumb.is_file()
def test_tika_parse_successful(self):
assert (
self.imagehash(thumb) == self.imagehash(simple_txt_email_thumbnail_file)
), f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}"
def test_tika_parse_successful(self, mail_parser: MailDocumentParser):
"""
GIVEN:
- Fresh start
@ -143,15 +146,16 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
expected_text = "Some Text"
# Check successful parsing
parsed = self.parser.tika_parse(html)
self.assertEqual(expected_text, parsed.strip())
parsed = mail_parser.tika_parse(html)
assert expected_text == parsed.strip()
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf_gotenberg_merging(
self,
mock_generate_pdf_from_html: mock.MagicMock,
mock_generate_pdf_from_mail: mock.MagicMock,
mocker: MockerFixture,
mail_parser: MailDocumentParser,
html_email_file: Path,
merged_pdf_first: Path,
merged_pdf_second: Path,
):
"""
GIVEN:
@ -161,61 +165,67 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
THEN:
- gotenberg is called to merge files and the resulting file is returned
"""
mock_generate_pdf_from_mail.return_value = self.SAMPLE_DIR / "first.pdf"
mock_generate_pdf_from_html.return_value = self.SAMPLE_DIR / "second.pdf"
msg = self.parser.parse_file_to_message(
self.SAMPLE_DIR / "html.eml",
mock_generate_pdf_from_html = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
)
mock_generate_pdf_from_mail = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
)
mock_generate_pdf_from_mail.return_value = merged_pdf_first
mock_generate_pdf_from_html.return_value = merged_pdf_second
msg = mail_parser.parse_file_to_message(html_email_file)
_, pdf_path = util_call_with_backoff(
self.parser.generate_pdf,
mail_parser.generate_pdf,
[msg],
)
self.assertIsFile(pdf_path)
assert pdf_path.exists()
assert pdf_path.is_file()
extracted = extract_text(pdf_path)
expected = (
"first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c"
)
self.assertEqual(expected, extracted)
assert expected == extracted
def test_generate_pdf_from_mail(self):
def test_generate_pdf_from_mail(
self,
mail_parser: MailDocumentParser,
html_email_file: Path,
html_email_pdf_file: Path,
html_email_thumbnail_file: Path,
):
"""
GIVEN:
- Fresh start
WHEN:
- pdf generation from simple eml file is requested
THEN:
- gotenberg is called and the resulting file is returned and look as expected.
- Gotenberg is called and the resulting file is returned and look as expected.
"""
util_call_with_backoff(
self.parser.parse,
[self.SAMPLE_DIR / "html.eml", "message/rfc822"],
)
util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"])
# Check the archive PDF
archive_path = self.parser.get_archive_path()
archive_path = mail_parser.get_archive_path()
archive_text = extract_text(archive_path)
expected_archive_text = extract_text(self.SAMPLE_DIR / "html.eml.pdf")
expected_archive_text = extract_text(html_email_pdf_file)
# Archive includes the HTML content, so use in
self.assertIn(expected_archive_text, archive_text)
assert expected_archive_text in archive_text
# Check the thumbnail
generated_thumbnail = self.parser.get_thumbnail(
self.SAMPLE_DIR / "html.eml",
generated_thumbnail = mail_parser.get_thumbnail(
html_email_file,
"message/rfc822",
)
generated_thumbnail_hash = self.imagehash(generated_thumbnail)
# The created pdf is not reproducible. But the converted image should always look the same.
expected_hash = self.imagehash(self.SAMPLE_DIR / "html.eml.pdf.webp")
expected_hash = self.imagehash(html_email_thumbnail_file)
self.assertEqual(
generated_thumbnail_hash,
expected_hash,
f"PDF looks different. Check if {generated_thumbnail} looks weird.",
)
assert (
generated_thumbnail_hash == expected_hash
), f"PDF looks different. Check if {generated_thumbnail} looks weird."