More changes to fixtures, etc.
This commit is contained in:
parent
c59e444d23
commit
4d53dff978
@ -225,11 +225,11 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
|
|||||||
return default_thumbnail_path
|
return default_thumbnail_path
|
||||||
|
|
||||||
|
|
||||||
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
|
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path:
|
||||||
"""
|
"""
|
||||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||||
"""
|
"""
|
||||||
out_path = os.path.join(temp_dir, "convert.webp")
|
out_path = temp_dir / "convert.webp"
|
||||||
|
|
||||||
# Run convert to get a decent thumbnail
|
# Run convert to get a decent thumbnail
|
||||||
try:
|
try:
|
||||||
@ -242,7 +242,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
|
|||||||
auto_orient=True,
|
auto_orient=True,
|
||||||
use_cropbox=True,
|
use_cropbox=True,
|
||||||
input_file=f"{in_path}[0]",
|
input_file=f"{in_path}[0]",
|
||||||
output_file=out_path,
|
output_file=str(out_path),
|
||||||
logging_group=logging_group,
|
logging_group=logging_group,
|
||||||
)
|
)
|
||||||
except ParseError as e:
|
except ParseError as e:
|
||||||
|
@ -52,7 +52,12 @@ class MailDocumentParser(DocumentParser):
|
|||||||
return PdfAFormat.A3b
|
return PdfAFormat.A3b
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
|
def get_thumbnail(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
file_name=None,
|
||||||
|
) -> Path:
|
||||||
if not self.archive_path:
|
if not self.archive_path:
|
||||||
self.archive_path = self.generate_pdf(
|
self.archive_path = self.generate_pdf(
|
||||||
self.parse_file_to_message(document_path),
|
self.parse_file_to_message(document_path),
|
||||||
|
@ -25,6 +25,11 @@ def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
|
|||||||
return sample_dir / "simple_text.eml.pdf"
|
return sample_dir / "simple_text.eml.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "simple_text.eml.pdf.webp"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def html_email_file(sample_dir: Path) -> Path:
|
def html_email_file(sample_dir: Path) -> Path:
|
||||||
return sample_dir / "html.eml"
|
return sample_dir / "html.eml"
|
||||||
@ -35,11 +40,26 @@ def html_email_pdf_file(sample_dir: Path) -> Path:
|
|||||||
return sample_dir / "html.eml.pdf"
|
return sample_dir / "html.eml.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def html_email_thumbnail_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "html.eml.pdf.webp"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def html_email_html_file(sample_dir: Path) -> Path:
|
def html_email_html_file(sample_dir: Path) -> Path:
|
||||||
return sample_dir / "html.eml.html"
|
return sample_dir / "html.eml.html"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def merged_pdf_first(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "first.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def merged_pdf_second(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "second.pdf"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
def mail_parser() -> MailDocumentParser:
|
def mail_parser() -> MailDocumentParser:
|
||||||
return MailDocumentParser(logging_group=None)
|
return MailDocumentParser(logging_group=None)
|
||||||
|
@ -5,6 +5,7 @@ from pathlib import Path
|
|||||||
import httpx
|
import httpx
|
||||||
import pytest
|
import pytest
|
||||||
from django.test.html import parse_html
|
from django.test.html import parse_html
|
||||||
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
from pytest_httpx import HTTPXMock
|
from pytest_httpx import HTTPXMock
|
||||||
from pytest_mock import MockerFixture
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
@ -328,7 +329,11 @@ class TestTikaHtmlParse:
|
|||||||
with pytest.raises(ParseError):
|
with pytest.raises(ParseError):
|
||||||
mail_parser.tika_parse(html)
|
mail_parser.tika_parse(html)
|
||||||
|
|
||||||
def test_tika_parse_unreachable(self, mail_parser: MailDocumentParser):
|
def test_tika_parse_unreachable(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
mail_parser: MailDocumentParser,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- Fresh start
|
- Fresh start
|
||||||
@ -341,7 +346,7 @@ class TestTikaHtmlParse:
|
|||||||
|
|
||||||
# Check if exception is raised when Tika cannot be reached.
|
# Check if exception is raised when Tika cannot be reached.
|
||||||
with pytest.raises(ParseError):
|
with pytest.raises(ParseError):
|
||||||
mail_parser.tika_server = ""
|
settings.TIKA_ENDPOINT = "http://does-not-exist:9998"
|
||||||
mail_parser.tika_parse(html)
|
mail_parser.tika_parse(html)
|
||||||
|
|
||||||
|
|
||||||
|
@ -3,17 +3,15 @@ import shutil
|
|||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
import pytest
|
import pytest
|
||||||
from django.test import TestCase
|
|
||||||
from imagehash import average_hash
|
from imagehash import average_hash
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
from documents.tests.utils import FileSystemAssertsMixin
|
|
||||||
from documents.tests.utils import util_call_with_backoff
|
from documents.tests.utils import util_call_with_backoff
|
||||||
from paperless_mail.tests.test_parsers import BaseMailParserTestCase
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
|
|
||||||
def extract_text(pdf_path: Path) -> str:
|
def extract_text(pdf_path: Path) -> str:
|
||||||
@ -50,7 +48,7 @@ class MailAttachmentMock:
|
|||||||
"PAPERLESS_CI_TEST" not in os.environ,
|
"PAPERLESS_CI_TEST" not in os.environ,
|
||||||
reason="No Gotenberg/Tika servers to test with",
|
reason="No Gotenberg/Tika servers to test with",
|
||||||
)
|
)
|
||||||
class TestUrlCanary(TestCase):
|
class TestUrlCanary:
|
||||||
"""
|
"""
|
||||||
Verify certain URLs are still available so testing is valid still
|
Verify certain URLs are still available so testing is valid still
|
||||||
"""
|
"""
|
||||||
@ -69,13 +67,13 @@ class TestUrlCanary(TestCase):
|
|||||||
whether this image stays online forever, so here we check if we can detect if is not
|
whether this image stays online forever, so here we check if we can detect if is not
|
||||||
available anymore.
|
available anymore.
|
||||||
"""
|
"""
|
||||||
with self.assertRaises(httpx.HTTPStatusError) as cm:
|
with pytest.raises(httpx.HTTPStatusError) as exec_info:
|
||||||
resp = httpx.get(
|
resp = httpx.get(
|
||||||
"https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
|
"https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
|
||||||
)
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|
||||||
self.assertEqual(cm.exception.response.status_code, httpx.codes.NOT_FOUND)
|
assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND
|
||||||
|
|
||||||
def test_is_online_image_still_available(self):
|
def test_is_online_image_still_available(self):
|
||||||
"""
|
"""
|
||||||
@ -100,13 +98,19 @@ class TestUrlCanary(TestCase):
|
|||||||
"PAPERLESS_CI_TEST" not in os.environ,
|
"PAPERLESS_CI_TEST" not in os.environ,
|
||||||
reason="No Gotenberg/Tika servers to test with",
|
reason="No Gotenberg/Tika servers to test with",
|
||||||
)
|
)
|
||||||
class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
|
class TestParserLive:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def imagehash(file, hash_size=18):
|
def imagehash(file, hash_size=18):
|
||||||
return f"{average_hash(Image.open(file), hash_size)}"
|
return f"{average_hash(Image.open(file), hash_size)}"
|
||||||
|
|
||||||
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
|
def test_get_thumbnail(
|
||||||
def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
|
self,
|
||||||
|
mocker: MockerFixture,
|
||||||
|
mail_parser: MailDocumentParser,
|
||||||
|
simple_txt_email_file: Path,
|
||||||
|
simple_txt_email_pdf_file: Path,
|
||||||
|
simple_txt_email_thumbnail_file: Path,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- Fresh start
|
- Fresh start
|
||||||
@ -115,22 +119,21 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
|
|||||||
THEN:
|
THEN:
|
||||||
- The returned thumbnail image file is as expected
|
- The returned thumbnail image file is as expected
|
||||||
"""
|
"""
|
||||||
mock_generate_pdf.return_value = self.SAMPLE_DIR / "simple_text.eml.pdf"
|
mock_generate_pdf = mocker.patch(
|
||||||
thumb = self.parser.get_thumbnail(
|
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||||
self.SAMPLE_DIR / "simple_text.eml",
|
|
||||||
"message/rfc822",
|
|
||||||
)
|
)
|
||||||
self.assertIsFile(thumb)
|
mock_generate_pdf.return_value = simple_txt_email_pdf_file
|
||||||
|
|
||||||
expected = self.SAMPLE_DIR / "simple_text.eml.pdf.webp"
|
thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822")
|
||||||
|
|
||||||
self.assertEqual(
|
assert thumb.exists()
|
||||||
self.imagehash(thumb),
|
assert thumb.is_file()
|
||||||
self.imagehash(expected),
|
|
||||||
f"Created Thumbnail {thumb} differs from expected file {expected}",
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_tika_parse_successful(self):
|
assert (
|
||||||
|
self.imagehash(thumb) == self.imagehash(simple_txt_email_thumbnail_file)
|
||||||
|
), f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}"
|
||||||
|
|
||||||
|
def test_tika_parse_successful(self, mail_parser: MailDocumentParser):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- Fresh start
|
- Fresh start
|
||||||
@ -143,15 +146,16 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
|
|||||||
expected_text = "Some Text"
|
expected_text = "Some Text"
|
||||||
|
|
||||||
# Check successful parsing
|
# Check successful parsing
|
||||||
parsed = self.parser.tika_parse(html)
|
parsed = mail_parser.tika_parse(html)
|
||||||
self.assertEqual(expected_text, parsed.strip())
|
assert expected_text == parsed.strip()
|
||||||
|
|
||||||
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
|
|
||||||
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
|
|
||||||
def test_generate_pdf_gotenberg_merging(
|
def test_generate_pdf_gotenberg_merging(
|
||||||
self,
|
self,
|
||||||
mock_generate_pdf_from_html: mock.MagicMock,
|
mocker: MockerFixture,
|
||||||
mock_generate_pdf_from_mail: mock.MagicMock,
|
mail_parser: MailDocumentParser,
|
||||||
|
html_email_file: Path,
|
||||||
|
merged_pdf_first: Path,
|
||||||
|
merged_pdf_second: Path,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
@ -161,61 +165,67 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
|
|||||||
THEN:
|
THEN:
|
||||||
- gotenberg is called to merge files and the resulting file is returned
|
- gotenberg is called to merge files and the resulting file is returned
|
||||||
"""
|
"""
|
||||||
mock_generate_pdf_from_mail.return_value = self.SAMPLE_DIR / "first.pdf"
|
mock_generate_pdf_from_html = mocker.patch(
|
||||||
mock_generate_pdf_from_html.return_value = self.SAMPLE_DIR / "second.pdf"
|
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
|
||||||
|
|
||||||
msg = self.parser.parse_file_to_message(
|
|
||||||
self.SAMPLE_DIR / "html.eml",
|
|
||||||
)
|
)
|
||||||
|
mock_generate_pdf_from_mail = mocker.patch(
|
||||||
|
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
|
||||||
|
)
|
||||||
|
mock_generate_pdf_from_mail.return_value = merged_pdf_first
|
||||||
|
mock_generate_pdf_from_html.return_value = merged_pdf_second
|
||||||
|
|
||||||
|
msg = mail_parser.parse_file_to_message(html_email_file)
|
||||||
|
|
||||||
_, pdf_path = util_call_with_backoff(
|
_, pdf_path = util_call_with_backoff(
|
||||||
self.parser.generate_pdf,
|
mail_parser.generate_pdf,
|
||||||
[msg],
|
[msg],
|
||||||
)
|
)
|
||||||
self.assertIsFile(pdf_path)
|
assert pdf_path.exists()
|
||||||
|
assert pdf_path.is_file()
|
||||||
|
|
||||||
extracted = extract_text(pdf_path)
|
extracted = extract_text(pdf_path)
|
||||||
expected = (
|
expected = (
|
||||||
"first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c"
|
"first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(expected, extracted)
|
assert expected == extracted
|
||||||
|
|
||||||
def test_generate_pdf_from_mail(self):
|
def test_generate_pdf_from_mail(
|
||||||
|
self,
|
||||||
|
mail_parser: MailDocumentParser,
|
||||||
|
html_email_file: Path,
|
||||||
|
html_email_pdf_file: Path,
|
||||||
|
html_email_thumbnail_file: Path,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- Fresh start
|
- Fresh start
|
||||||
WHEN:
|
WHEN:
|
||||||
- pdf generation from simple eml file is requested
|
- pdf generation from simple eml file is requested
|
||||||
THEN:
|
THEN:
|
||||||
- gotenberg is called and the resulting file is returned and look as expected.
|
- Gotenberg is called and the resulting file is returned and look as expected.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
util_call_with_backoff(
|
util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"])
|
||||||
self.parser.parse,
|
|
||||||
[self.SAMPLE_DIR / "html.eml", "message/rfc822"],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check the archive PDF
|
# Check the archive PDF
|
||||||
archive_path = self.parser.get_archive_path()
|
archive_path = mail_parser.get_archive_path()
|
||||||
archive_text = extract_text(archive_path)
|
archive_text = extract_text(archive_path)
|
||||||
expected_archive_text = extract_text(self.SAMPLE_DIR / "html.eml.pdf")
|
expected_archive_text = extract_text(html_email_pdf_file)
|
||||||
|
|
||||||
# Archive includes the HTML content, so use in
|
# Archive includes the HTML content, so use in
|
||||||
self.assertIn(expected_archive_text, archive_text)
|
assert expected_archive_text in archive_text
|
||||||
|
|
||||||
# Check the thumbnail
|
# Check the thumbnail
|
||||||
generated_thumbnail = self.parser.get_thumbnail(
|
generated_thumbnail = mail_parser.get_thumbnail(
|
||||||
self.SAMPLE_DIR / "html.eml",
|
html_email_file,
|
||||||
"message/rfc822",
|
"message/rfc822",
|
||||||
)
|
)
|
||||||
generated_thumbnail_hash = self.imagehash(generated_thumbnail)
|
generated_thumbnail_hash = self.imagehash(generated_thumbnail)
|
||||||
|
|
||||||
# The created pdf is not reproducible. But the converted image should always look the same.
|
# The created pdf is not reproducible. But the converted image should always look the same.
|
||||||
expected_hash = self.imagehash(self.SAMPLE_DIR / "html.eml.pdf.webp")
|
expected_hash = self.imagehash(html_email_thumbnail_file)
|
||||||
|
|
||||||
self.assertEqual(
|
assert (
|
||||||
generated_thumbnail_hash,
|
generated_thumbnail_hash == expected_hash
|
||||||
expected_hash,
|
), f"PDF looks different. Check if {generated_thumbnail} looks weird."
|
||||||
f"PDF looks different. Check if {generated_thumbnail} looks weird.",
|
|
||||||
)
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user